forked from vanatteveldt/frogr
-
Notifications
You must be signed in to change notification settings - Fork 0
/
frog.r
89 lines (81 loc) · 3.24 KB
/
frog.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
library(zoo)
library(Matrix)
library(tm)
library(plyr)
#' Call a frog (Dutch lemmatizer and dependency parser) instance running in daemon mode
#'
#' See http://ilk.uvt.nl/frog/
#' To install frog and run as daemon (assuming debian/ubuntu), run:
#' $ sudo apt-get install frog frogdata ucto
#' $ frog -S 9772
#'
#' A separate call to frog is made for each text in the text input vector.
#'
#' Note that if something is wrong, it is quite possible that this function will totall hang
#' your R session as it is waiting for output on the socket, so use with caution!
#'
#' @param text: The text(s) to parse
#' @param host: The hostname for the frog server
#' @param port: The port the frog server is listening on
#' @param verbose: If true, output a message for each document
#' @return a data frame of tokens with columns for lemma, pos, etc
#' @export
call_frog <- function(text, host="localhost", port=9772, verbose=T) {
# establish connection and add finalizing code
socket <- make.socket(host, port)
on.exit(close.socket(socket))
# call frog, ending with EOT
result <- vector("list", length(text))
for (i in 1:length(text)) {
t = text[i]
if(nchar(t) == 0) next
if (verbose) message("Frogging document ",i,": ", nchar(t), " characters")
tokens = do_call_frog(socket, t)
tokens$docid = i
result[[i]] = tokens
}
result = rbind.fill(result)
result[, c(ncol(result), ncol(result)-1, 1:(ncol(result)-2))]
}
#' Do the actual call to frog, returning the data frame
do_call_frog <- function(socket, text) {
write.socket(socket, text)
write.socket(socket, "\nEOT\n")
# read until 'READY' is found
output <- ""
while (!grepl("\nREADY\n$", output)) {
output = paste(output, read.socket(socket), sep="")
}
output = gsub("READY\n$", "", output)
# read output and label columns
con <- textConnection(output)
result = read.csv(con, sep='\t', quote = '', header=F)
close(con)
colnames(result) <- c("position", "word", "lemma", "morph", "pos", "prob",
"ner", "chunk", "parse1", "parse2")
result$majorpos = gsub("\\(.*", "", result$pos)
# assign sentence number by assigning number when position == 1 and filling down into NA cells using zoo::na.locf
result$sent[result$position == 1] = 1:sum(result$position == 1)
if(is.na(result$sent[1])) result$sent[1] = 1
result$sent = na.locf(result$sent)
result
}
#' Create a document term matrix from a token list
#'
#' @param docs: a vector that identifies to which document a token belongs
#' @param terms: a vector of terms of length equal to docs
#' @param freqs: an optional vector giving the frequency of each term
#' @param weighting: the optional weighting for tm (default: term frequency)
#' @return an object of type DocumentTermMatrix (from the tm package)
#' @export
create_dtm <- function(docs, terms, freqs=rep(1, length(terms)), weighting=weightTf) {
d = data.frame(doc=docs, term=terms, freq=freqs)
d = aggregate(freq ~ doc + term, d, FUN='sum')
docnames = unique(d$doc)
termnames = unique(d$term)
sm = spMatrix(nrow=length(docnames), ncol=length(termnames),
match(d$doc, docnames), match(d$term, termnames), d$freq)
rownames(sm) = docnames
colnames(sm) = termnames
as.DocumentTermMatrix(sm, weighting=weighting)
}