NA/NaN/Inf in foreign function call (arg 6)

11,419

I was stuck on the same issue, but I modified the code my own way to remove all the NA values. You can check my code and compare it with yours to see what the problem might be.

#init
# Attach the packages this script relies on. library() stops with an error
# when a package is missing, whereas require() only returns FALSE with a
# warning and lets the script fail later on an undefined function.
# invisible() suppresses the list that lapply() would otherwise print.
libs <- c("tm" , "plyr" , "class")
invisible(lapply(libs, library, character.only = TRUE))

#set options
# Keep character columns as character when data frames are built.
options(stringsAsFactors = FALSE)

#set parameters

# Each entry is a sub-folder of `pathname`; rows labelled "test" are the ones
# classified at the end of the script.
candidates <- c("user1" , "user2" ,"test")
pathname <- "C:/Users/prabhjot.rai/Documents/Project_r/textMining"

#clean text

# Run a tm corpus through a fixed sequence of transformations: remove
# punctuation, strip whitespace, lower-case, remove English stop words, and
# finally map PlainTextDocument over the documents.
# The cleaned corpus is returned invisibly.
cleanCorpus <- function(corpus)
{
  steps <- list(
    function(x) tm_map(x, removePunctuation),
    function(x) tm_map(x, stripWhitespace),
    function(x) tm_map(x, content_transformer(tolower)),
    function(x) tm_map(x, removeWords, stopwords("english")),
    function(x) tm_map(x, PlainTextDocument)
  )

  cleaned <- corpus
  for (step in steps) {
    # Each step consumes the output of the previous one.
    cleaned <- step(cleaned)
  }
  invisible(cleaned)
}

#build TDM

# Build the term-document matrix for one candidate.
#   cand: name of the candidate; also the sub-folder of `path` that is read.
#   path: directory containing one sub-folder per candidate.
# Every file in <path>/<cand> is loaded into a corpus, cleaned with
# cleanCorpus(), turned into a TermDocumentMatrix, and passed through
# removeSparseTerms() with a sparsity threshold of 0.7.
# Returns (invisibly) list(name = cand, tdm = <the reduced matrix>).
generateTDM <- function(cand,path)
{
  raw_corpus <- Corpus(DirSource(directory = sprintf("%s/%s", path, cand)))
  term_matrix <- TermDocumentMatrix(cleanCorpus(raw_corpus))
  invisible(list(name = cand , tdm = removeSparseTerms(term_matrix, 0.7)))
}



# One list(name, tdm) entry per candidate folder.
tdm <- lapply(candidates, generateTDM, path = pathname)


# Scratch preview of the first candidate's matrix with documents as rows.
# `test` is reassigned further down before it is used.
test <- t(data.matrix(tdm[[1]]$tdm))
# seq_len() stays correct for an empty matrix, where 1:nrow() gives c(1, 0).
rownames(test) <- seq_len(nrow(test))

#attach name and convert to dataframe

# Turn one generateTDM() result into a data frame with one row per document,
# one column per term, and a `candidateName` column holding the label.
#   thisTDM: list with elements `name` (the label) and `tdm` (a term-by-document
#            matrix, or anything data.matrix() can convert to one).
# Returns the data frame.
makeMatrix <- function(thisTDM){
  # Transpose so that documents become rows and terms become columns.
  docs <- t(data.matrix(thisTDM$tdm))
  # seq_len() copes with zero documents; 1:nrow() would produce c(1, 0).
  rownames(docs) <- seq_len(nrow(docs))
  # as.data.frame() takes no na.rm argument, so none is passed; a single
  # conversion is enough.
  docs <- as.data.frame(docs, stringsAsFactors = FALSE)
  # rep() keeps the assignment valid when the data frame has no rows.
  docs$candidateName <- rep(thisTDM$name, nrow(docs))
  docs
}

# One labelled data frame per candidate.
candTDM <- lapply(tdm, function(one_tdm) makeMatrix(one_tdm))

# stack all the speeches together
# rbind.fill() accepts the list of data frames directly and fills columns that
# a candidate lacks with NA; those cells are then set to 0.
tdm.stack <- rbind.fill(candTDM)
missing_cells <- is.na(tdm.stack)
tdm.stack[missing_cells] <- 0

#testing and training sets
# Rows labelled 'test' are the documents to classify; every other row is
# training data. The label column itself is excluded from the features.
is_test <- tdm.stack$candidateName == 'test'
is_feature <- names(tdm.stack) != 'candidateName'

train <- tdm.stack[!is_test, ]
# drop = FALSE keeps a data frame even when only one term column remains;
# without it the result collapses to a plain vector, which knn() does not
# interpret as one column with one row per document.
train <- train[, is_feature, drop = FALSE]
test <- tdm.stack[is_test, ]
test <- test[, is_feature, drop = FALSE]
# Class labels of the training rows, in the same row order as `train`.
classes <- as.factor(tdm.stack[!is_test, 'candidateName'])

# Predict a candidate for every test document from its single nearest
# training document.
myknn <- knn(train = train, test = test, cl = classes, k = 1)
myknn

Keep a test file in a `test` folder next to the `user1` and `user2` folders to check the output of this algorithm. Set k to roughly the square root of the number of speeches, preferably an odd number. Ignore the redundancy in how the training and test sets are assigned: doing each in a single line did not work on my machine, so I did it in two lines.

Share:
11,419

Related videos on Youtube

user2647221
Author by

user2647221

Updated on June 04, 2022

Comments

  • user2647221
    user2647221 almost 2 years

    I am doing a term paper in Text mining using R. Our task is to guess the tone of an article (positive/negative). The articles are stored in respective folders. I need to create a classification system which will learn through training samples. I reused the code from http://www.youtube.com/watch?v=j1V2McKbkLo The entire code except the last line got executed successfully. Following is the code.

    # Class labels. Each label is also used as a sub-folder name under
    # folderpath (see the sprintf("%s/%s", path, tone) call in generateTDM).
    tone<- c("Positive", "Negative")
    folderpath <- "C:/Users/Tanmay/Desktop/R practice/Week8"
    
    # NOTE(review): the option name is spelled "stringAsFactors" here; the base R
    # option is "stringsAsFactors", so this line presumably sets an unrelated
    # option and has no effect on data-frame construction — confirm.
    options(stringAsFactors = FALSE)
    
    # Build a corpus from the files directly under folderpath and look at it.
    # Neither this top-level `corpus` nor `corpuscopy` is referenced again in
    # the code shown; the functions below use their own local `corpus`.
    corpus<-Corpus(DirSource(folderpath))
    corpuscopy<-corpus
    summary(corpus)
    inspect(corpus)
    
    # Clean data
    # Apply tm text transformations to `corpus` and return the result.
    # NOTE(review): every tm_map() call below is given `corpus` (the untouched
    # input) instead of `corpustemp`, so each assignment overwrites the previous
    # step rather than building on it. The value returned is therefore the input
    # with only the last transformation (stripWhitespace) applied.
    # NOTE(review): tolower is passed to tm_map() directly, whereas the answer's
    # script wraps it in content_transformer(); depending on the installed tm
    # version the wrapper may be required — confirm.
    CleanCorpus <- function(corpus){
    
      corpustemp <- tm_map(corpus, removeNumbers)
      corpustemp <- tm_map(corpus, removePunctuation)
      corpustemp <- tm_map(corpus, tolower)
      corpustemp <- tm_map(corpus, removeWords, stopwords("english"))
      corpustemp <- tm_map(corpus, stemDocument,language="english")
      corpustemp <- tm_map(corpus, stripWhitespace)
    
      return(corpustemp )
    }
    
    
    # Document term matrix
    # Build the document-term matrix for one tone.
    #   tone: label; also the sub-folder of `path` that is read.
    #   path: directory containing one sub-folder per tone.
    # Reads the files in <path>/<tone> with encoding "ANSI", cleans them with
    # CleanCorpus(), builds a DocumentTermMatrix and passes it through
    # removeSparseTerms() with a sparsity threshold of 0.7.
    # The last expression is an assignment, so list(Tone = tone, tdm = <matrix>)
    # is returned invisibly.
    generateTDM <- function(tone,path) {
    
      corpusdir <- sprintf("%s/%s",path,tone)
      corpus<- Corpus(DirSource( directory=corpusdir ,encoding = "ANSI"))
      corpustemp <- CleanCorpus(corpus)
      corpusclean <- DocumentTermMatrix(corpustemp)
      corpusclean <- removeSparseTerms(corpusclean , 0.7)
      result <- list(Tone = tone, tdm = corpusclean) 
    }
    
    # One list(Tone, tdm) entry per label in `tone`.
    tdm <- lapply(tone,generateTDM,path=folderpath)
    
    # Attach tone
    # Convert one generateTDM() result to a data frame and append a label column.
    #   tdm: list with elements "Tone" (the label) and "tdm" (the matrix).
    # Returns the data frame with its last column named "PredictTone".
    # NOTE(review): in the cbind() call the parenthesis that closes rep() comes
    # before nrow(temp.df), so cbind() receives three arguments: temp.df,
    # rep(tdm[["Tone"]]) and nrow(temp.df). The colnames() line then names the
    # last of these (the row count) "PredictTone", and the tone text is left in
    # a separate column. Presumably rep(tdm[["Tone"]], nrow(temp.df)) was
    # intended — confirm. A text column left among the features would be
    # consistent with the "NAs introduced by coercion" warnings reported below.
    ToneBindTotdm <- function(tdm){
      temp.mat <- data.matrix(tdm[["tdm"]])
      temp.df <- as.data.frame(temp.mat)
      temp.df <- cbind(temp.df,rep(tdm[["Tone"]]),nrow(temp.df))
      colnames(temp.df)[ncol(temp.df)] <- "PredictTone"
      return(temp.df)
    }
    # One labelled data frame per tone.
    Tonetdm <- lapply(tdm,ToneBindTotdm)
    
    
    # Stack
    # Row-bind the per-tone data frames with rbind.fill, then replace every NA
    # cell in the stacked result with 0.
    Stacktdm <- do.call(rbind.fill,Tonetdm)
    Stacktdm[is.na(Stacktdm)] <- 0
    
    
    # Holdout
    # Draw 70% of the row indices (rounded up) at random for training; the
    # remaining indices form the test set. No set.seed() call is visible, so the
    # split differs between runs.
    
    trainid <- sample(nrow(Stacktdm),ceiling(nrow(Stacktdm) * 0.7))
    testid <- (1:nrow(Stacktdm)) [- trainid]
    
    # knn
    # tdmone holds the "PredictTone" column (used as the class labels);
    # tdmone.nl holds every other column (used as the features).
    tdmone <- Stacktdm[,"PredictTone"]
    tdmone.nl <- Stacktdm[, !colnames(Stacktdm) %in% "PredictTone"]
    
    # 5-nearest-neighbour classification of the test rows from the training rows.
    knnPredict <- knn(tdmone.nl[trainid,],tdmone.nl[testid,],tdmone[trainid],k=5)
    

    When I tried to execute this, I got error in the last line (knn) :

    **Error in knn(tdmone.nl[trainid, ], tdmone.nl[testid, ], tdmone[trainid],  : 
      NA/NaN/Inf in foreign function call (arg 6)
    In addition: Warning messages:
    1: In knn(tdmone.nl[trainid, ], tdmone.nl[testid, ], tdmone[trainid],  :
      NAs introduced by coercion
    2: In knn(tdmone.nl[trainid, ], tdmone.nl[testid, ], tdmone[trainid],  :
      NAs introduced by coercion**
    

    Could anyone please help me out? Also, if there are other simpler and better ways to classify, please point me to them. Thanks, and sorry for the long post.