Rev. | 0bc36b41d1b3964a397b5e569e9999af5bdd776c |
---|---|
Tamanho | 2,176 bytes |
Hora | 2021-10-01 17:38:14 |
Autor | Lorenzo Isella |
Mensagem de Log | A simple script to perform some basic text mining. |
rm(list=ls())
library(tidyverse)
library(stringr)
library(stringi)
library(openxlsx)
library(janitor)
library(tidytext)
source("/home/lorenzo/myprojects-hg/R-codes/stat_lib.R")
### load and clean the raw data
# read_excel() is exported by readxl, which library(tidyverse) does not
# attach and which openxlsx does not provide, so the call is
# namespace-qualified to make it resolve regardless of what the sourced
# helper file happens to attach.
df_ini <- readxl::read_excel("etsi_wax_melts.xlsx")

# Run remove_special_char() over every column with " " as its second
# argument, then keep only the rows with no missing value in any column.
# NOTE(review): remove_special_char() is not defined in this script --
# presumably it comes from the sourced stat_lib.R; confirm its semantics.
df <- df_ini %>%
  mutate(across(everything(), ~ remove_special_char(.x, " "))) %>%
  filter(complete.cases(.))
### split the cleaned data into the two text columns of interest
# Each result is a one-column data frame, ready for tokenisation.
titles <- select(df, title)
descriptions <- select(df, description)
### tokenise into single words
# unnest_tokens() writes the tokens taken from the input column (title /
# description) into a new column named `word`.
tidy_titles <- unnest_tokens(titles, word, title)
tidy_descriptions <- unnest_tokens(descriptions, word, description)
### remove stop words
# Load the stop_words dataset into the workspace.
data(stop_words)

# Drop every token that appears in the stop-word list. The join key is
# spelled out so the join is not a silent natural join: this avoids the
# "Joining, by = ..." message and keeps the result stable should the two
# tables ever share another column name.
tidy_titles <- tidy_titles %>%
  anti_join(stop_words, by = "word")

tidy_descriptions <- tidy_descriptions %>%
  anti_join(stop_words, by = "word")
### word frequencies
# Count how often each remaining word occurs, most frequent first.
word_count_titles <- count(tidy_titles, word, sort = TRUE)
word_count_descriptions <- count(tidy_descriptions, word, sort = TRUE)
### work on titles
# Tokenise the titles into bigrams (pairs of consecutive words). A title
# with fewer than two tokens produces an NA bigram; those rows are dropped
# here, otherwise they would slip through the stop-word filters below
# (NA %in% x is FALSE) and be counted as a spurious NA/NA pair.
titles_bigrams <- titles %>%
  unnest_tokens(bigram, title, token = "ngrams", n = 2) %>%
  filter(!is.na(bigram))

# Split each bigram into its two component words.
titles_bigrams_separated <- titles_bigrams %>%
  separate(bigram, c("word1", "word2"), sep = " ")

# Keep only the bigrams in which neither word is a stop word.
titles_bigrams_filtered <- titles_bigrams_separated %>%
  filter(
    !word1 %in% stop_words$word,
    !word2 %in% stop_words$word
  )

# Frequency of each remaining word pair, most frequent first.
titles_bigram_counts <- titles_bigrams_filtered %>%
  count(word1, word2, sort = TRUE)
### work on descriptions
# Tokenise the descriptions into bigrams (pairs of consecutive words). A
# description with fewer than two tokens produces an NA bigram; those rows
# are dropped here, otherwise they would slip through the stop-word filters
# below (NA %in% x is FALSE) and be counted as a spurious NA/NA pair.
descriptions_bigrams <- descriptions %>%
  unnest_tokens(bigram, description, token = "ngrams", n = 2) %>%
  filter(!is.na(bigram))

# Split each bigram into its two component words.
descriptions_bigrams_separated <- descriptions_bigrams %>%
  separate(bigram, c("word1", "word2"), sep = " ")

# Keep only the bigrams in which neither word is a stop word.
descriptions_bigrams_filtered <- descriptions_bigrams_separated %>%
  filter(
    !word1 %in% stop_words$word,
    !word2 %in% stop_words$word
  )

# Frequency of each remaining word pair, most frequent first.
descriptions_bigram_counts <- descriptions_bigrams_filtered %>%
  count(word1, word2, sort = TRUE)
### trigrams on descriptions
# Tokenise the descriptions into trigrams, split them into their three
# words, discard any trigram containing a stop word and count what is left
# (most frequent first). Descriptions with fewer than three tokens produce
# an NA trigram; those rows are removed up front so they are not counted as
# a spurious NA/NA/NA triple (NA %in% x is FALSE, so the stop-word filter
# alone would let them through).
trigram_description <- descriptions %>%
  unnest_tokens(trigram, description, token = "ngrams", n = 3) %>%
  filter(!is.na(trigram)) %>%
  separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%
  filter(
    !word1 %in% stop_words$word,
    !word2 %in% stop_words$word,
    !word3 %in% stop_words$word
  ) %>%
  count(word1, word2, word3, sort = TRUE)

# Progress marker: reaching this line means every step above ran.
print("So far so good")