# Load (and install if missing) all packages used in this analysis.
pacman::p_load(tidyverse, readtext, quanteda, tidytext, jsonlite, DT)
# Sanity check: confirm the article files are visible before reading them.
# NOTE: use the same relative path that readtext() reads below — the previous
# absolute "/data/articles/*" did not exist (list.files() returned
# character(0)). Also, list.files() expects a directory path, not a glob;
# the trailing "/*" was treated as a literal (nonexistent) subdirectory.
print(paste0("Checking directory: ", "data/articles"))
print(list.files("data/articles"))
# Read every article file under data/articles into a readtext object:
# one row per document, with columns doc_id (file name) and text.
text_data <- readtext("data/articles/*")

# Quick structural check of what was loaded (row count, column types).
glimpse(text_data)
# Tokenize the articles into one word per row, then keep only word-like
# tokens (ending in a lowercase letter or apostrophe — drops numbers and
# punctuation-only tokens) and remove common English stop words.
usenet_words <- text_data %>%
  unnest_tokens(word, text) %>%
  filter(
    str_detect(word, "[a-z']$"),
    !word %in% stop_words$word
  )

# Inspect the most frequent words across the whole corpus.
usenet_words %>%
  count(word, sort = TRUE)
# Split each doc_id on the "__0__" delimiter into two columns, X and Y.
# too_few = "align_end" right-aligns the pieces when a doc_id contains
# fewer delimiters than expected, leaving X as NA rather than erroring.
text_data_splitted <- text_data %>%
  separate_wider_delim(
    doc_id,
    delim = "__0__",
    names = c("X", "Y"),
    too_few = "align_end"
  )