In-class_ex5

Author

You Ting QUEK

Published

May 11, 2024

Modified

May 13, 2024

pacman::p_load(tidyverse, readtext, quanteda, tidytext, jsonlite, DT)
# Sanity check: show which article files are visible before loading them.
# The directory is given relative to the project root, i.e. the same location
# readtext() loads from. list.files() expects a directory path rather than a
# glob, so the earlier absolute "/data/articles/*" matched nothing and always
# produced character(0).
articles_dir <- "data/articles"
print(paste0("Checking directory: ", articles_dir))
[1] "Checking directory: data/articles"
print(list.files(articles_dir))
character(0)
# Load every file matched by data/articles/* into one data frame with a row per
# file: `doc_id` holds the file name and `text` the file contents.
text_data <- readtext("data/articles/*")

# Quick structural check of the loaded articles (row count and column types).
glimpse(text_data)
Rows: 338
Columns: 2
$ doc_id <chr> "Alvarez PLC__0__0__Haacklee Herald.txt", "Alvarez PLC__0__0__L…
$ text   <chr> "Marine Sanctuary Aid Boosts Alvarez PLC's Sustainable Fishing …
# Break each article into one word token per row, then keep only tokens whose
# last character is a letter or apostrophe and that are not stop words.
#
# text_data carries the "readtext" class. Left in place, that class is
# inherited by the token table and by anything derived from it, so printed
# results are reported as "readtext object consisting of N documents" with a
# spurious `text` column. Converting to a plain tibble first avoids this.
usenet_words <- text_data %>%
  as_tibble() %>%
  unnest_tokens(word, text) %>%
  filter(
    str_detect(word, "[a-z']$"),
    !word %in% stop_words$word
  )
# Frequency of each remaining word across all articles, most frequent first.
count(usenet_words, word, sort = TRUE)
readtext object consisting of 3261 documents and 0 docvars.
# A data frame: 3,261 × 3
  word             n text     
  <chr>        <int> <chr>    
1 fishing       2177 "\"\"..."
2 sustainable   1525 "\"\"..."
3 company       1036 "\"\"..."
4 practices      838 "\"\"..."
5 industry       715 "\"\"..."
6 transactions   696 "\"\"..."
# ℹ 3,255 more rows
# Split `doc_id` on the literal "__0__" marker into two columns, X and Y.
# With too_few = "align_end", an id that yields fewer pieces than names is
# aligned to the last column, leaving the earlier one NA.
# NOTE(review): for an id such as "Alvarez PLC__0__0__Haacklee Herald.txt" the
# split presumably leaves Y as "0__Haacklee Herald.txt" -- confirm that this is
# the intended result.
text_data_splitted <- separate_wider_delim(
  text_data,
  cols = "doc_id",
  delim = "__0__",
  names = c("X", "Y"),
  too_few = "align_end"
)