I wrote the script shown below (based on the infamous Coursera course, plus some additions) and it has served me well at work. I don't know if something has changed, but it no longer runs, and I haven't modified anything.
The first thing that fails is the loop that removes special characters.
Next, when I convert the corpus to plain text documents, the word cloud refuses to render.
Finally, the tokenizer functions all produce essentially the same chart: the most frequently used single words instead of the 2-, 3-, 4-word (and so on) n-grams they were programmed to count. Every n-gram plot is just the unigram plot.
I don't know whether package updates or an R update is the cause.
Any thoughts?
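For what it's worth, the symptoms line up with tm's API changes (content_transformer() became required for base functions at tm 0.6, and Corpus() started returning a SimpleCorpus around tm 0.7), so the first thing worth checking is which versions are actually loaded:
R.version.string
packageVersion("tm")
packageVersion("RWeka")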
#Set working directory and read file
cname <- file.path("c:/texts")
cname
dir(cname)
setwd(cname)
library("RColorBrewer")
library("tm")
library("knitr")
library("devtools")
library("plyr")
library("ggplot2")
library("wordcloud")
library("rJava")
library("RWeka")
library("stringi")
library("XLConnect")
library("XLConnectJars")
df<- readWorksheetFromFile("uars.xlsx", sheet=1, startRow=1)
df1 <- df[df$Business %in% "FRAUD", ]
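# Sanity check before building the corpus: confirm the FRAUD filter actually
# matched rows (an empty df1 would fail quietly downstream)
table(df$Business)
nrow(df1)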
#Load your texts into R (the tm package is already loaded above).
#Use VCorpus explicitly: in current tm, Corpus() can return a SimpleCorpus,
#which silently ignores the custom tokenize= control used for the n-grams below
docs <- VCorpus(VectorSource(df1))
summary(docs)
#read your documents in the R terminal using
inspect(docs)
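# To eyeball a single document's text: since tm 0.6 the elements are
# PlainTextDocument objects, so coerce with as.character() first
writeLines(as.character(docs[[1]]))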
#Preprocessing
#Removing punctuation
docs <- tm_map(docs, removePunctuation)
# Remove special characters. Since tm 0.6 the documents can no longer be
# edited with a bare gsub() loop; wrap the substitution in content_transformer()
# so the PlainTextDocument objects stay intact.
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
docs <- tm_map(docs, toSpace, "/")
docs <- tm_map(docs, toSpace, "@")
docs <- tm_map(docs, toSpace, "\\|")
#Removing numbers:
docs <- tm_map(docs, removeNumbers)
#Converting to lowercase (since tm 0.6, base tolower() must be wrapped in
#content_transformer(), or the corpus gets corrupted):
docs <- tm_map(docs, content_transformer(tolower))
#Removing "stopwords" (common words) that usually have no analytic value
docs <- tm_map(docs, removeWords, c(stopwords("english"), "bank", "account", "customer", "transactions", "sent", "received", "company",
"wire", "wires", "payment", "payments", "wells", "fargo", "transaction", "fraud", "wholesale", "wholesal", "uar", "email"))
#Removing common word endings (e.g., "ing", "es", "s")
library(SnowballC)
docs <- tm_map(docs, stemDocument)
#Stripping unnecessary whitespace from your documents:
docs <- tm_map(docs, stripWhitespace)
#Note: the old tm_map(docs, PlainTextDocument) step is what breaks the word
#cloud under current tm; with content_transformer() used above, the documents
#are already PlainTextDocuments, so that conversion is no longer needed.
#Stage the Data
#To proceed, create a document term matrix
dtm <- DocumentTermMatrix(docs)
dtm
inspect(dtm)
#transpose of this matrix
tdm <- TermDocumentMatrix(docs)
tdm
#Build the word cloud from term frequencies rather than from the corpus
#itself; passing a corpus to wordcloud() broke alongside the tm changes above
word_freqs <- sort(rowSums(as.matrix(tdm)), decreasing=TRUE)
wordcloud(names(word_freqs), word_freqs, scale=c(3,0.5), min.freq=5, max.words=100,
random.order=TRUE, rot.per=0.5, colors=brewer.pal(8, "Set1"), use.r.layout=FALSE)
#Tokenizer functions
bigram <- function(x) NGramTokenizer(x, Weka_control(min=2, max=2))
trigram <- function(x) NGramTokenizer(x, Weka_control(min=3, max=3))
quadgram <- function(x) NGramTokenizer(x, Weka_control(min=4, max=4))
fivegram <- function(x) NGramTokenizer(x, Weka_control(min=5, max=5))
sixgram <- function(x) NGramTokenizer(x, Weka_control(min=6, max=6))
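# Sanity check that RWeka itself still tokenizes; this should print
# "the quick" "quick brown" "brown fox". If it does, the identical charts
# are coming from the corpus or the plotting code, not from the tokenizers.
bigram("the quick brown fox")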
#Word/phrase count function
freq_df <- function(tdm){
  # Helper function to tabulate term frequency from a term-document matrix
  freq <- sort(rowSums(as.matrix(tdm)), decreasing=TRUE)
  data.frame(word=names(freq), freq=freq)
}
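# Quick look at the helper on the unigram matrix; the top rows should be
# single stemmed words with their counts
head(freq_df(tdm))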
#Creating the n-grams
corpus.unigram <- TermDocumentMatrix(docs)
corpus.unigram <- removeSparseTerms(corpus.unigram, 0.99)
corpus.unigram.freq <- freq_df(corpus.unigram)
corpus.bigram <- TermDocumentMatrix(docs, control=list(tokenize=bigram))
corpus.bigram <- removeSparseTerms(corpus.bigram, 0.999)
corpus.bigram.freq <- freq_df(corpus.bigram)
corpus.trigram <- TermDocumentMatrix(docs, control=list(tokenize=trigram))
corpus.trigram <- removeSparseTerms(corpus.trigram, 0.99)
corpus.trigram.freq <- freq_df(corpus.trigram)
corpus.quadgram <- TermDocumentMatrix(docs, control=list(tokenize=quadgram))
corpus.quadgram <- removeSparseTerms(corpus.quadgram, 0.9999)
corpus.quadgram.freq <- freq_df(corpus.quadgram)
corpus.fivegram <- TermDocumentMatrix(docs, control=list(tokenize=fivegram))
corpus.fivegram <- removeSparseTerms(corpus.fivegram, 0.9999)
corpus.fivegram.freq <- freq_df(corpus.fivegram)
corpus.sixgram <- TermDocumentMatrix(docs, control=list(tokenize=sixgram))
corpus.sixgram <- removeSparseTerms(corpus.sixgram, 0.9999)
corpus.sixgram.freq <- freq_df(corpus.sixgram)
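# Verify the tokenizers actually took effect before plotting; if the corpus
# were still a SimpleCorpus, every table here would show single words again
head(corpus.bigram.freq)   # rows should be two-word phrases
head(corpus.trigram.freq)  # rows should be three-word phrases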
top_50 <- function(df1, title, color) {
  # Plot from the df1 argument, not the global df (the raw Excel sheet);
  # pulling data from the workspace is what made every chart come out identical.
  # Also guard against frequency tables shorter than 50 rows (sparse n-grams).
  n <- min(50, nrow(df1))
  ggplot(df1[1:n,], aes(x = seq_len(n), y = freq)) +
    geom_bar(stat = "identity", fill = color, colour = "black", width = 0.80) +
    coord_cartesian(xlim = c(0, n + 1)) +
    labs(title = title) +
    xlab("Words") +
    ylab("Count") +
    scale_x_continuous(breaks = seq_len(n), labels = df1$word[1:n]) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))
}
top_50(corpus.unigram.freq,"Top 50 words","green")
top_50(corpus.bigram.freq,"Top 2 word combos","yellow")
top_50(corpus.trigram.freq,"Top 3 word combos","orange")
top_50(corpus.quadgram.freq,"Top 4 word combos","red")
top_50(corpus.fivegram.freq,"Top 5 word combos","blue")
top_50(corpus.sixgram.freq,"Top 6 word combos","purple")