As mentioned in the previous post, all it takes to process Chinese text with tm is a reader that performs word segmentation. Now that jiebaR provides the segmentation, there is nothing standing in the way.
Reading Chinese text files
Based on the reader template from that article, and following the jiebaR documentation, we can write a reader like this:
library(tm)
#> Loading required package: NLP
library(jiebaR)
#> Loading required package: jiebaRD
mixseg = worker()
cnreader <- function(elem, language, id)
{
    # segment the document content with the jiebaR worker
    words <- mixseg[elem$content]
    # join the tokens into a single space-separated string that tm can index
    ncon <- paste(words, collapse = " ")
    PlainTextDocument(ncon, id = id, language = language)
}
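As a side note on the segmentation call inside cnreader: the `[` operator of a jiebaR worker is shorthand for segment() and returns a character vector of tokens. A quick check on an arbitrary sample sentence (not taken from the corpus used in this post) looks like this:
mixseg["现在有了jiebaR分词"]                          # a character vector of tokens
paste(mixseg["现在有了jiebaR分词"], collapse = " ")   # the string cnreader stores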
Next, we build a corpus from all the md files in the current directory and inspect the first document:
mcop = VCorpus(DirSource(".",pattern="a*md"),
readerControl = list(reader=cnreader))
mcop
#> <<VCorpus>>
#> Metadata: corpus specific: 0, document level (indexed): 0
#> Content: documents: 714
mcop[[1]]
#> <<PlainTextDocument>>
#> Metadata: 7
#> Content: chars: 21
as.character(mcop[[1]])
#> [1] "bash echo hello world"
Reading Word and PDF documents
The reader above can read every text file under the current directory and segment it. A more practical scenario, however, is reading Word and PDF documents.
Before introducing those two readers, let's first look at what the elem handed over by a DirSource actually contains.
# classes that have a getElem method
methods(getElem)
#> [1] getElem.DataframeSource* getElem.DirSource*
#> [3] getElem.URISource* getElem.VectorSource*
#> [5] getElem.XMLSource* getElem.ZipSource*
#> see '?methods' for accessing help and source code
# all methods for the DirSource class
methods(class="DirSource")
#> [1] getElem pGetElem
#> see '?methods' for accessing help and source code
getAnywhere(getElem.DirSource)
#> A single object matching 'getElem.DirSource' was found
#> It was found in the following places
#> registered S3 method for getElem from namespace tm
#> namespace:tm
#> with value
#>
#> function (x)
#> {
#> filename <- x$filelist[x$position]
#> list(content = readContent(filename, x$encoding, x$mode),
#> uri = paste0("file://", filename))
#> }
#> <environment: namespace:tm>
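To make this concrete, we can drive a DirSource by hand and inspect a single elem ourselves. This is only a sketch; it assumes at least one md file in the working directory, and uses getElem() and stepNext() from the source API documented in ?Source:
ds <- DirSource(".", pattern = "md$")
ds <- stepNext(ds)   # advance from position 0 to the first file
el <- getElem(ds)    # a list with $content and $uri, matching the code above
str(el, nchar.max = 60)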
So what do readDOC and readPDF look like?
readDOC
#> function (AntiwordOptions = "")
#> {
#> stopifnot(is.character(AntiwordOptions))
#> function(elem, language, id) {
#> uri <- processURI(elem$uri)
#> content <- system2("antiword", c(AntiwordOptions, shQuote(normalizePath(uri))),
#> stdout = TRUE)
#> PlainTextDocument(content, id = basename(elem$uri), language = language)
#> }
#> }
#> <environment: namespace:tm>
#> attr(,"class")
#> [1] "FunctionGenerator" "function"
readPDF
#> function (engine = c("xpdf", "Rpoppler", "ghostscript", "Rcampdf",
#> "custom"), control = list(info = NULL, text = NULL))
#> {
#> stopifnot(is.character(engine), is.list(control))
#> engine <- match.arg(engine)
#> pdf_info <- switch(engine, xpdf = function(x) pdf_info_via_xpdf(x,
#> control$info), Rpoppler = Rpoppler::PDF_info, ghostscript = pdf_info_via_gs,
#> Rcampdf = Rcampdf::pdf_info, custom = control$info)
#> pdf_text <- switch(engine, xpdf = function(x) system2("pdftotext",
#> c(control$text, shQuote(x), "-"), stdout = TRUE), Rpoppler = Rpoppler::PDF_text,
#> ghostscript = pdf_text_via_gs, Rcampdf = Rcampdf::pdf_text,
#> custom = control$text)
#> if (!is.function(pdf_info) || !is.function(pdf_text))
#> stop("invalid function for PDF extraction")
#> function(elem, language, id) {
#> uri <- processURI(elem$uri)
#> meta <- pdf_info(uri)
#> content <- pdf_text(uri)
#> PlainTextDocument(content, meta$Author, meta$CreationDate,
#> meta$Subject, meta$Title, basename(elem$uri), language,
#> meta$Creator)
#> }
#> }
#> <environment: namespace:tm>
#> attr(,"class")
#> [1] "FunctionGenerator" "function"
Now let's try a reader built along these lines:
myreadDOC <- readDOC("-t")
myreadPDF <- readPDF("xpdf")
word_pdf_reader <- function(elem, language, id)
{
    con <- NULL
    # decide whether the file is a pdf or a docx and call the matching parser
    if (grepl("pdf$", elem$uri, ignore.case = TRUE)) {
        print(paste("processing pdf file", elem$uri))
        con <- myreadPDF(elem, language, id)
    }
    if (grepl("docx$", elem$uri, ignore.case = TRUE)) {
        print(paste("processing docx file", elem$uri))
        con <- myreadDOC(elem, language, id)
    }
    if (is.null(con)) stop("unsupported file type: ", elem$uri)
    # segment the extracted text and join it back into a form tm can index
    words <- mixseg[con$content]
    ncon <- paste(words, collapse = " ")
    con$content <- ncon
    con
}
Building the corpus:
mcop = VCorpus(DirSource(".",pattern="*docx"),
readerControl = list(reader=word_pdf_reader))
mcop
mcop[[1]]
as.character(mcop[[1]])
Note that antiword can only handle the MS Word 2003 and earlier formats; it does not support the newer docx format well. That conversion would be better done with pandoc.
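A sketch of that approach (it assumes pandoc is installed and on the PATH; docx_to_text is just an illustrative helper name, not part of tm):
docx_to_text <- function(path) {
    # pandoc converts the docx to plain text and writes it to stdout
    system2("pandoc", c(shQuote(normalizePath(path)), "-t", "plain"),
            stdout = TRUE)
}
# a helper like this could replace the myreadDOC call inside word_pdf_reader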
Judging from the output, this largely achieves what we wanted, and further analysis can proceed from here.
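For example, one possible next step (a sketch, not part of the original workflow) is to build a document-term matrix from the segmented corpus; the wordLengths setting keeps the short tokens that are common in Chinese:
dtm <- DocumentTermMatrix(mcop, control = list(wordLengths = c(1, Inf)))
findFreqTerms(dtm, lowfreq = 10)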