The non-scholarly write-up – Wordclouds in R: Oh, the possibilities!


Okay, so I am stoked to report that I can now build them pretty wordclouds! I am even more pleased with how easy the process is. There’s a whole array of plots you can play around with, including:

Commonality Cloud: Allows you to view words common to both corpora.

Comparison Cloud: Allows you to view words that are not common to both corpora.

Polarized Plot: A better take on the commonality cloud, allowing you to tell which corpus has a greater concentration of a particular word.

Visualized Word Network: Shows the network of words associated with a main word (sketched near the end of this post).

Let’s jump right into it.

Step 1: Load libraries

require("tm") # the text mining package
require("qdap") # for qdap package's cleaning functions
require("twitteR") # to connect to twitter and extract tweets
require("plotrix") # for the pyramid plot

Step 2: Read in your choice of tweets

After connecting to Twitter, I downloaded 5000 tweets each from searches for the keywords “hillary” and “trump” – and this was minutes after the US elections 2016 results were declared. Twitter has never been so lit!
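(In case you haven’t connected yet: searchTwitter needs an authenticated session first. A minimal sketch, assuming you have registered a Twitter app – the placeholder strings below are mine, not real credentials:)

# authenticate before searching; swap in your own app credentials
setup_twitter_oauth("API_KEY", "API_SECRET", "ACCESS_TOKEN", "ACCESS_SECRET")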

hillary<-searchTwitter("hillary",n=5000,lang = "en")
trump<- searchTwitter("trump",n=5000,lang="en")

 

Step 3: Write and apply functions to perform data transformation and cleaning

a) Function to extract text from the tweets, which get downloaded in list form. We do this using getText, which is an accessor method.

convert_to_text <- function(x){
  x$getText()
}

b) Functions to process our tweets to remove retweet duplicates and URLs.

replacefunc <- function(x){
  # strip everything from "https://" to the end of the tweet
  # (Twitter's t.co links are https)
  gsub("https://(.*)", "", x)
}

replace_dup <- function(x){
  # blank out retweets, i.e. tweets starting with "rt" or "RT"
  gsub("^(rt|RT)(.*)", "", x)
}
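A quick sanity check on made-up tweets (the example strings are mine):

replacefunc("Election night! https://t.co/abc123") # "Election night! "
replace_dup("RT @someone: what a night") # ""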

c) Function to further clean the character vector: for example, to remove brackets, replace abbreviations and symbols with their word equivalents, and expand contractions to their full versions.

clean_qdap <- function(x){
  x <- bracketX(x) # remove bracketed text
  x <- replace_abbreviation(x)
  x <- replace_contraction(x)
  x <- replace_symbol(x)
  x <- tolower(x)
  return(x)
}
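A rough illustration of what it does (the exact output depends on qdap's dictionaries):

clean_qdap("Clinton (D) isn't done yet")
# brackets dropped, "isn't" expanded to "is not", text lowercased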

d) Apply the above functions

hillary_text <- sapply(hillary, convert_to_text)
hillary_text1 <- hillary_text
hill_remove_url <- replacefunc(hillary_text1)
hill_sub <- replace_dup(hill_remove_url)
# drop the blanked-out retweets; subsetting directly avoids the R gotcha
# where x[-which(x == "")] returns an empty vector when there are no matches
hill_sub_complete <- hill_sub[hill_sub != ""]

trump_text <- sapply(trump, convert_to_text)
trump_text1 <- trump_text
trump_remove_url <- replacefunc(trump_text1)
trump_sub <- replace_dup(trump_remove_url)
trump_sub_complete <- trump_sub[trump_sub != ""]

# encode to UTF-8: capable of encoding all possible characters defined by Unicode
trump_sub_complete <- paste(trump_sub_complete, collapse = " ")
Encoding(trump_sub_complete) <- "UTF-8"
trump_sub_complete <- iconv(trump_sub_complete, "UTF-8", "UTF-8", sub = '') # drop non-UTF-8 characters
trump_clean <- clean_qdap(trump_sub_complete)
trump_clean1 <- trump_clean

hill_sub_complete <- paste(hill_sub_complete, collapse = " ")
Encoding(hill_sub_complete) <- "UTF-8"
hill_sub_complete <- iconv(hill_sub_complete, "UTF-8", "UTF-8", sub = '') # drop non-UTF-8 characters
hillary_clean <- clean_qdap(hill_sub_complete)
hillary_clean1 <- hillary_clean

 

Step 4: Convert the character vectors to VCorpus objects

trump_corpus <- VCorpus(VectorSource(trump_clean1))
hill_corpus <- VCorpus(VectorSource(hillary_clean1))
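To sanity-check the conversion, you can peek inside with tm's accessors:

content(trump_corpus[[1]]) # the collapsed Trump document
content(hill_corpus[[1]]) # the collapsed Hillary document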

 

Step 5: Define and apply a function to format the corpus objects

clean_corpus <- function(corpus){
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, stripWhitespace)
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removeWords,
                   c(stopwords("en"), "supporters", "vote", "election", "like", "even",
                     "get", "will", "can", "amp", "still", "just", "now"))
  return(corpus)
}

trump_corpus_clean <- clean_corpus(trump_corpus)
hill_corpus_clean <- clean_corpus(hill_corpus)
  • Note: qdap's cleaning functions work on character vectors, but tm's functions need a corpus as input.

Step 6: Convert the corpora into TermDocumentMatrix (TDM) objects

Tdmobjecthillary <- TermDocumentMatrix(hill_corpus_clean)
Tdmobjecttrump <- TermDocumentMatrix(trump_corpus_clean)

Step 7: Convert the TDM objects into matrices

Tdmobjectmatrixhillary <- as.matrix(Tdmobjecthillary)
Tdmobjectmatrixtrump <- as.matrix(Tdmobjecttrump)

 

Step 8: Sum rows and create term-frequency dataframe

Freq <- rowSums(Tdmobjectmatrixhillary)
Word_freq <- data.frame(term= names(Freq),num=Freq)

Freqtrump <- rowSums(Tdmobjectmatrixtrump)
Word_freqtrump <- data.frame(term= names(Freqtrump),num=Freqtrump)
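Before plotting, it's worth eyeballing the most frequent terms in each corpus:

head(Word_freq[order(Word_freq$num, decreasing = TRUE), ], 10)
head(Word_freqtrump[order(Word_freqtrump$num, decreasing = TRUE), ], 10)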

Step 9: Prep for fancier wordclouds

# unify the corpora
cc <- c(trump_corpus_clean,hill_corpus_clean)

# convert to TDM
all_tdm <- TermDocumentMatrix(cc)
colnames(all_tdm) <- c("Trump","Hillary")

# convert to matrix
all_m <- as.matrix(all_tdm)


# Create common_words
common_words <- subset(all_m, all_m[, 1] > 0 & all_m[, 2] > 0)

# Create difference
difference <- abs(common_words[, 1] - common_words[, 2])

# Combine common_words and difference
common_words <- cbind(common_words, difference)

# Order the data frame from most differences to least
common_words <- common_words[order(common_words[, 3], decreasing = TRUE), ]

# Create top25_df
top25_df <- data.frame(x = common_words[1:25, 1], 
 y = common_words[1:25, 2], 
 labels = rownames(common_words[1:25, ]))

Step 10: It’s word cloud time!

 

a) The ‘everyday’ cloud

wordcloud(Word_freq$term, Word_freq$num, scale=c(3,0.5),max.words=1000, 
          random.order=FALSE, rot.per=0.35, use.r.layout=FALSE, 
          colors=brewer.pal(5, "Blues"))

wordcloud(Word_freqtrump$term, Word_freqtrump$num, scale=c(3,0.5),max.words=1000,
          random.order=FALSE, rot.per=0.35, use.r.layout=FALSE,
          colors=brewer.pal(5, "Reds"))

[Figure: 'everyday' word clouds – Hillary (blues) and Trump (reds)]

 

b) The Polarized pyramid plot

# Create the pyramid plot
pyramid.plot(top25_df$x, top25_df$y, labels = top25_df$labels,
             gap = 70, top.labels = c("Trump", "Words", "Hillary"),
             main = "Words in Common", laxlab = NULL,
             raxlab = NULL, unit = NULL)

[Figure: polarized pyramid plot of the top 25 shared words]

 

c) The comparison cloud 

comparison.cloud(all_m, colors = c("red", "blue"),max.words=100)

[Figure: comparison cloud]

d) The commonality cloud 

commonality.cloud(all_m, colors = "steelblue1",max.words=100)

[Figure: commonality cloud]
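e) The visualized word network (mentioned at the start). A hedged sketch using qdap's word_associate on the un-collapsed tweets; the match word "america" is my arbitrary pick:

# network of words around a chosen match word
word_associate(trump_sub[trump_sub != ""], match.string = "america",
               stopwords = stopwords("en"), network.plot = TRUE)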

 

We made it! That’s it for this post, folks.

Coming up next: Mining deeper into text.

Top 16%: How to plateau using the ‘Feature Engineering’ approach.


Ladies and Gents! I am now placed in the top 16% of the leaderboard rankings with a score of 0.80383.

[Figure: Kaggle leaderboard screenshot]

 

I have also plateaued horribly. No matter what other features I try to ‘engineer’, my score just won’t budge. It gets worse, sure, but never better. Bummer.

Everything pretty much remains the same as the previous post in terms of data reading and cleaning. In this post, let’s look at what I did differently.

This attempt was a departure from applying the algorithms as-is and hoping for a better prediction (admit it, we’re all guilty). This time I incorporated the ‘human’ element – I even tried to recall scenes from the movie for that extra insight (still unfair how Rose hogged the entire wooden plank).

Some of the theories I considered:

  • Women and children were given priority and evacuated first.
  • Mothers would look out for their children.
  • First class passengers were given priority over those in 2nd or 3rd class.
  • Women and children were probably given priority over males in every class.
  • Families travelling together probably had a better chance of survival since they’d try to stick together and help each other out.
  • Older people would have trouble evacuating and hence, would have lower odds of survival.

 

Also, this time around, I played around with the ‘Name’ and ‘Cabin’ variables, and that made a huge difference!

So what you need to do to plateau with an 80.4% prediction is as follows:

Identify the unique titles and create a new variable unique:

# extract each passenger's title: the text between the comma and the first period

unique <- gsub(".*?,\\s(.*?)\\..*$", "\\1", dat$Name)

dat$unique<- unique
dat$unique[dat$unique %in% c("Mlle","Mme")] <-"Mlle"
dat$unique[dat$unique %in% c('Capt', 'Don', 'Major', 'Sir')] <- 'Sir'
dat$unique[dat$unique %in% c('Dona', 'Lady', 'the Countess','Jonkheer')] <- 'Lady'

table(dat$unique) # check the distribution of different titles

# passenger’s title 
dat$unique <- factor(dat$unique)

Identify the children and create a new variable ischild:

dat$ischild <- factor(ifelse(dat$Age<=16,"Child","Adult"))
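One caveat: Age is missing for a chunk of passengers, so ischild will contain NAs. A simple hedged fix (my assumption, not necessarily what the original run did) is to impute the median age first:

# Age has NAs in the Titanic data; median imputation is one crude option
dat$Age[is.na(dat$Age)] <- median(dat$Age, na.rm = TRUE)
dat$ischild <- factor(ifelse(dat$Age <= 16, "Child", "Adult"))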

Identify the mothers and create a new variable isMother:

dat$isMother <- "Not Mother"
# use the recoded dat$unique rather than the raw 'unique' vector
dat$isMother[dat$Sex=="female" & dat$Parch>0 & dat$unique!="Miss"] <- "Mother"
dat$isMother <- factor(dat$isMother)

Uniquely identify the Cabins: This variable leads to somewhat of an overfit.

dat$Cabin <- substr(dat$Cabin,1,1)
dat$Cabin[dat$Cabin %in% c("F","G","T",NA)] <- "X"
dat$Cabin<- factor(dat$Cabin)

Compute the family size and create a new variable familysize :

dat$familysize <- dat$SibSp + dat$Parch + 1

Use the ‘familysize‘ variable and the surname of the passenger to designate the family size as “Small” or “Big” in the new variable unit:

pass_names <- dat$Name
extractsurname <- function(x){
  # the surname is everything before the first comma
  gsub("^(.*?),\\s.*?$", "\\1", x)
}

surnames <- vapply(pass_names, FUN=extractsurname,FUN.VALUE = character(1),USE.NAMES = F)
fam<-paste(as.character(dat$familysize),surnames,sep=" ")


famsize <- function(x){
  # the leading token of each "fam" string is the family size; compare it
  # numerically, otherwise "10" sorts before "2" in a string comparison
  if(as.numeric(substr(x, 1, 2)) > 2){
    x <- "Big"
  } else {
    x <- "Small"
  }
}

unit <- vapply(fam, FUN=famsize,FUN.VALUE = character(1),USE.NAMES = F)
dat$unit <- unit
dat$unit <- factor(dat$unit)
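A quick distribution check:

table(dat$unit) # how many passengers fall in Small vs Big families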

 

Split the ‘dat’ dataset into train and test sets (60:40 split) and fit the random forest model.

n<- nrow(dat)
shuffled <- dat[sample(n),]

traindat <- shuffled[1:round(0.6*n),]
testdat<- shuffled[(round(0.6*n) + 1):n,]

dim(traindat)
dim(testdat)

require(caret)
require(ranger)
# note: Survived should be a factor so caret/ranger treat this as classification
model <- train(
  Survived ~ .,
  tuneLength = 50,
  data = traindat, method = "ranger",
  trControl = trainControl(method = "cv", number = 5, verboseIter = TRUE)
)

pred <- predict(model, newdata = testdat[,-2]) # drop the Survived column (column 2)
conf <- table(testdat$Survived, pred)
accuracy <- sum(diag(conf))/sum(conf)
accuracy
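caret also bundles a richer summary than the manual diag() computation, assuming pred and Survived are factors:

# accuracy plus sensitivity/specificity and the confusion matrix in one call
confusionMatrix(pred, testdat$Survived)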

Using the model to predict survival (minus Cabin) gives us 83.14% accuracy on our test data ‘testdat’ and 80.34% on Kaggle.

Using the model to predict survival (with Cabin) gives us 83.71% accuracy on our test data ‘testdat’, which drops to around 79% on Kaggle.

Although I still haven’t tinkered around with ‘Fare’, ‘Ticket’, and ‘Embarked’ (the urge to do so is strong), I think I’ll just leave them alone for the time being – but I will be revisiting for that elusive ‘eureka’ moment!

You can find the code here.