NA/NaN/Inf in foreign function call (arg 6)

r text-mining knn text-classification

11,419

I was stuck on the same issue and solved it by changing the code so that all the NA values are removed. Compare my code below with yours to see where the problem in your code might be.

#init
# Attach every required package. library() stops with an error when a
# package is missing, whereas require() only returns FALSE and lets the
# script fail later with a less obvious message. invisible() suppresses
# the list that lapply() would otherwise print.
libs <- c("tm" , "plyr" , "class")
invisible(lapply(libs, library, character.only = TRUE))

#set options
# Keep text columns as character vectors instead of factors.
options(stringsAsFactors = FALSE)

#set parameters

# Each candidate name is also the name of a sub-folder of `pathname`.
candidates <- c("user1" , "user2" ,"test")
pathname <- "C:/Users/prabhjot.rai/Documents/Project_r/textMining"

#clean text

# Normalise a tm corpus before a term-document matrix is built from it.
#
# The steps run in this order: remove punctuation, strip extra
# whitespace, lower-case, remove English stop words, and finally apply
# PlainTextDocument. The cleaned corpus is returned invisibly.
cleanCorpus <- function(corpus) {
  cleaning_steps <- list(
    function(docs) tm_map(docs, removePunctuation),
    function(docs) tm_map(docs, stripWhitespace),
    function(docs) tm_map(docs, content_transformer(tolower)),
    function(docs) tm_map(docs, removeWords, stopwords("english")),
    function(docs) tm_map(docs, PlainTextDocument)
  )

  cleaned <- corpus
  for (step in cleaning_steps) {
    cleaned <- step(cleaned)
  }
  invisible(cleaned)
}

#build TDM

# Build the term-document matrix for a single candidate.
#
# cand: candidate name; also the sub-folder of `path` that is read.
# path: directory containing one folder per candidate.
#
# Returns, invisibly, list(name = cand, tdm = <term-document matrix after
# removeSparseTerms(..., 0.7)>).
generateTDM <- function(cand, path) {
  raw_corpus <- Corpus(DirSource(directory = sprintf("%s/%s", path, cand)))
  term_matrix <- removeSparseTerms(
    TermDocumentMatrix(cleanCorpus(raw_corpus)),
    0.7
  )
  invisible(list(name = cand, tdm = term_matrix))
}



# One list(name, tdm) entry per candidate folder.
tdm <- lapply(candidates, generateTDM, path = pathname)


# Transpose the first candidate's matrix and number its rows 1..n.
# seq_len() keeps this valid when there are no rows: 1:nrow(test) would
# yield c(1, 0) there and the rownames assignment would fail.
test <- t(data.matrix(tdm[[1]]$tdm))
rownames(test) <- seq_len(nrow(test))

#attach name and convert to dataframe
# Convert one candidate's term-document matrix into a data frame with one
# row per document, one column per term, and a trailing `candidateName`
# column holding the candidate's name.
#
# thisTDM: list with elements `name` (candidate name) and `tdm` (the
#          term-document matrix, terms in rows and documents in columns).
#
# Returns a data frame whose rows are numbered 1..n.
makeMatrix <- function(thisTDM) {
  doc_terms <- t(data.matrix(thisTDM$tdm))
  # Dropping the document names makes as.data.frame() number the rows
  # 1..n; unlike 1:nrow(), this also works when there are no documents.
  rownames(doc_terms) <- NULL
  # as.data.frame() has no na.rm argument, so none is passed; one
  # conversion is enough.
  doc_terms <- as.data.frame(doc_terms, stringsAsFactors = FALSE)
  # rep() keeps the assignment valid for a zero-row data frame, where a
  # length-one replacement would be rejected.
  doc_terms$candidateName <- rep(thisTDM$name, nrow(doc_terms))
  doc_terms
}

candTDM <- lapply(tdm, makeMatrix)

# stack all the speeches together

# rbind.fill() pads the terms a candidate never used with NA; replace
# those with a count of zero so that knn() receives no missing values.
tdm.stack <- do.call(rbind.fill, candTDM)
tdm.stack[is.na(tdm.stack)] <- 0

#testing and training sets
# Rows from the "test" folder are classified; all other rows are training
# data. Only the term columns are used as features.
is.test <- tdm.stack$candidateName == "test"
is.term <- names(tdm.stack) != "candidateName"
# drop = FALSE keeps both sets as data frames even when a single term
# column is left; otherwise they would collapse to plain vectors.
train <- tdm.stack[!is.test, is.term, drop = FALSE]
test <- tdm.stack[is.test, is.term, drop = FALSE]
classes <- as.factor(tdm.stack[!is.test, "candidateName"])

myknn <- knn(train = train, test = test, cl = classes, k = 1)
myknn

Keep a test file in the `test` folder, next to the `user1` and `user2` folders, to check the output of this algorithm. As a rule of thumb, set k to roughly the square root of the number of speeches, preferably an odd number (the code above uses k = 1). Ignore the redundancy in how the training and test sets are assigned: doing it in one line did not work on my machine, so I did it in two lines.

11,419

user2647221

Updated on June 04, 2022

Comments

user2647221 almost 2 years

I am doing a term paper on text mining using R. Our task is to guess the tone of an article (positive/negative). The articles are stored in their respective folders. I need to create a classification system that learns from training samples. I reused the code from http://www.youtube.com/watch?v=j1V2McKbkLo. All of the code except the last line executed successfully. The code follows.

# Class labels; generateTDM() reads the sub-folder of `folderpath` named
# after each label.
tone <- c("Positive", "Negative")
folderpath <- "C:/Users/Tanmay/Desktop/R practice/Week8"

# The option is spelled stringsAsFactors. The misspelt "stringAsFactors"
# only creates an unused option and leaves the real setting untouched.
options(stringsAsFactors = FALSE)

# Load everything under `folderpath` and print an overview of the corpus.
corpus <- Corpus(DirSource(folderpath))
corpuscopy <- corpus
summary(corpus)
inspect(corpus)

#Clean data
# Clean a tm corpus: remove numbers and punctuation, lower-case, remove
# English stop words, stem, and strip extra whitespace, in that order.
#
# corpus: the tm corpus to clean.
# Returns the cleaned corpus.
CleanCorpus <- function(corpus){

  # Every step works on the result of the previous one. Feeding the
  # original `corpus` to each tm_map() call would throw away all steps
  # except the last.
  corpustemp <- tm_map(corpus, removeNumbers)
  corpustemp <- tm_map(corpustemp, removePunctuation)
  # tolower is a plain character function, so it is wrapped in
  # content_transformer() as tm expects for such functions.
  corpustemp <- tm_map(corpustemp, content_transformer(tolower))
  corpustemp <- tm_map(corpustemp, removeWords, stopwords("english"))
  corpustemp <- tm_map(corpustemp, stemDocument, language = "english")
  corpustemp <- tm_map(corpustemp, stripWhitespace)

  corpustemp
}


#Document term matrix
# Build the document-term matrix for the documents of one tone.
#
# tone: tone label; also the sub-folder of `path` that is read.
# path: directory containing one folder per tone.
#
# Returns, invisibly, list(Tone = tone, tdm = <document-term matrix after
# removeSparseTerms(..., 0.7)>).
generateTDM <- function(tone, path) {
  tone_dir <- sprintf("%s/%s", path, tone)
  cleaned <- CleanCorpus(
    Corpus(DirSource(directory = tone_dir, encoding = "ANSI"))
  )
  doc_terms <- removeSparseTerms(DocumentTermMatrix(cleaned), 0.7)
  invisible(list(Tone = tone, tdm = doc_terms))
}

# One list(Tone, tdm) entry per tone label.
tdm <- lapply(X = tone, FUN = generateTDM, path = folderpath)

#Attach tone
# Turn one tone's document-term matrix into a data frame and append the
# tone label as the last column, named "PredictTone".
#
# tdm: list with elements `Tone` (the label) and `tdm` (the matrix).
# Returns a data frame with one row per row of the matrix.
ToneBindTotdm <- function(tdm){
  temp.mat <- data.matrix(tdm[["tdm"]])
  temp.df <- as.data.frame(temp.mat)
  # The row count belongs inside rep(). Written as
  # cbind(df, rep(label), nrow(df)) it adds a label column plus a stray
  # numeric column, and only the stray column gets renamed below. The
  # text label then stays among the features and knn() fails with
  # "NAs introduced by coercion" / "NA/NaN/Inf in foreign function call".
  temp.df <- cbind(temp.df, rep(tdm[["Tone"]], nrow(temp.df)))
  colnames(temp.df)[ncol(temp.df)] <- "PredictTone"
  return(temp.df)
}
Tonetdm <- lapply(tdm, ToneBindTotdm)


#Stack
# rbind.fill() fills the terms missing from one tone with NA; replace
# those with zero.
Stacktdm <- do.call(rbind.fill, Tonetdm)
Stacktdm[is.na(Stacktdm)] <- 0


#Holdout

# A random 70% of the rows (rounded up) trains the classifier; the
# remaining rows are used for testing.
trainid <- sample(nrow(Stacktdm), ceiling(nrow(Stacktdm) * 0.7))
# setdiff() stays correct when trainid is empty, where x[-trainid] would
# select nothing at all; seq_len() avoids 1:0 for an empty data frame.
testid <- setdiff(seq_len(nrow(Stacktdm)), trainid)

#knn
tdmone <- Stacktdm[, "PredictTone"]
# drop = FALSE keeps the features a data frame even with a single term
# column.
tdmone.nl <- Stacktdm[, !colnames(Stacktdm) %in% "PredictTone", drop = FALSE]

knnPredict <- knn(tdmone.nl[trainid, , drop = FALSE],
                  tdmone.nl[testid, , drop = FALSE],
                  tdmone[trainid], k = 5)

When I tried to execute this, I got an error on the last line (knn):

**Error in knn(tdmone.nl[trainid, ], tdmone.nl[testid, ], tdmone[trainid],  : 
  NA/NaN/Inf in foreign function call (arg 6)
In addition: Warning messages:
1: In knn(tdmone.nl[trainid, ], tdmone.nl[testid, ], tdmone[trainid],  :
  NAs introduced by coercion
2: In knn(tdmone.nl[trainid, ], tdmone.nl[testid, ], tdmone[trainid],  :
  NAs introduced by coercion**

Could anyone please help me out? Also, if there are other, simpler and better ways to classify, please point me to them. Thanks, and sorry for the long post.