Sentence completion

This is an interesting application of natural language processing. Sentence auto-completion is an interesting feature that is shockingly absent in our modern-day browsers and mobile interfaces. Getting grammatically and contextually relevant suggestions as to what to type next, while we are typing a few words, would be such a great feature to have.

Coursera, in one of the data science courses by Johns Hopkins, provided four compressed datasets containing the terms and frequencies of unigrams, bigrams, trigrams, and 4-grams, respectively. The problem at hand was to come up with a model that can learn to predict relevant words to type next.

The following code uses the Katz-Backoff algorithm, leveraging the four n-gram term frequency datasets to predict the next word in a sentence:

library(tm)
library(stringr)
# Load the n-gram data
# Each load() call restores the objects saved in the given .RData file into the
# current environment.
# NOTE(review): presumably these files define data_frame1 ... data_frame4 (the
# unigram to 4-gram tables, each with a `terms` column), which Get_next_word
# reads as free variables -- confirm against the saved files.
# NOTE(review): the paths are anchored at the filesystem root ("/"); adjust
# them to wherever the .RData files actually live.
load("/data_frame1.RData");
load("/data_frame2.RData");
load("/data_frame3.RData");
load("/data_frame4.RData");
# Normalise raw user text so it can be matched against the n-gram tables.
#
# Steps: convert to ASCII (unconvertible bytes become spaces), drop every
# character that is not a letter, whitespace or punctuation (this already
# removes digits), lower-case, strip punctuation, collapse whitespace runs to a
# single space and trim both ends.
#
# The punctuation and whitespace steps use the same regular expressions that
# tm's removePunctuation() and stripWhitespace() apply by default, so no corpus
# has to be built for a single string.
#
# Args:
#   input_string: text to clean. Only the first element is used.
#
# Returns:
#   A length-one character vector; "" when the input is NULL, empty, NA, or
#   contains nothing but removable characters.
CleanInputString <- function(input_string) {
  # Nothing to clean: avoid passing a zero-length or NA value down the pipeline
  if (length(input_string) == 0L || is.na(input_string[1L])) {
    return("")
  }

  cleaned <- as.character(input_string[1L])

  # cleaning up data
  cleaned <- iconv(cleaned, "latin1", "ASCII", sub = " ")
  cleaned <- gsub("[^[:alpha:][:space:][:punct:]]", "", cleaned)

  # normalise case, punctuation and spacing
  cleaned <- tolower(cleaned)
  cleaned <- gsub("[[:punct:]]+", "", cleaned)
  cleaned <- gsub("[[:space:]]+", " ", cleaned)

  gsub("(^[[:space:]]+|[[:space:]]+$)", "", cleaned)
}
# Suggest likely next words for a partially typed sentence using a simple
# back-off over the n-gram tables.
#
# The input is cleaned with CleanInputString() and split into words. The last
# three words are looked up in the 4-gram table (data_frame4); if nothing
# matches, the last two words are looked up in the trigram table (data_frame3),
# then the last word in the bigram table (data_frame2). If no table has a
# match, the first unigram in data_frame1 is returned.
#
# Reads data_frame1 ... data_frame4 from the enclosing environment; each must
# have a `terms` column of space-separated n-grams.
# NOTE(review): rows are taken in table order, so the tables are presumably
# sorted by descending frequency -- confirm against the data files.
#
# Args:
#   input_string: the text typed so far.
#
# Returns:
#   A data.frame with one column, word_nxt, holding up to 10 suggestions
#   (a single "" when the cleaned input contains no words).
Get_next_word <- function(input_string) {
  max_suggestions <- 10L

  # Last word of up to `max_suggestions` n-grams whose leading words are
  # exactly `context`, in table order.
  find_continuations <- function(ngram_table, context) {
    ngram_terms <- as.character(ngram_table$terms)
    # The trailing space anchors on a whole word, so "i am" does not match
    # "i amaze ..."; which() drops NA terms instead of propagating them.
    hits <- ngram_terms[which(startsWith(ngram_terms, paste0(context, " ")))]
    # head() never pads with NA when fewer than `max_suggestions` rows match
    sub("^.* ", "", utils::head(hits, max_suggestions))
  }

  # Data cleansing using the function written earlier
  tokens <- unlist(strsplit(CleanInputString(input_string), split = " ",
                            fixed = TRUE))
  tokens <- tokens[nzchar(tokens)]
  n_tokens <- length(tokens)

  if (n_tokens == 0L) {
    return(data.frame(word_nxt = ""))
  }

  # Back off from the longest usable context (3 words -> 4-gram table) down to
  # a single word (bigram table); stop at the first level with any match.
  word_nxt <- character(0)
  for (context_size in seq(min(n_tokens, 3L), 1L)) {
    context <- paste(utils::tail(tokens, context_size), collapse = " ")
    ngram_table <- switch(context_size, data_frame2, data_frame3, data_frame4)
    word_nxt <- find_continuations(ngram_table, context)
    if (length(word_nxt) > 0L) {
      break
    }
  }

  # Nothing matched at any level: fall back to the first unigram
  if (length(word_nxt) == 0L) {
    word_nxt <- as.character(data_frame1$terms[1L])
  }

  data.frame(word_nxt = word_nxt)
}

Let's try out the function for an incomplete sentence:

Get_next_word("I AM")
word_nxt
1        so
2         a
3       not
4     going
5        in
6      sure
7       the
8     still
9      just
10      now
Get_next_word("I AM SO")
word_nxt
1   excited
2     happy
3      glad
4  thankful
5     proud
6   blessed
7     tired
8        so
9   jealous
10     very
..................Content has been hidden....................

You can't read all pages of this ebook; please click here to log in and view every page.
Reset