allow usage of fixed=TRUE

bnosac · May 26, 2021 · c7557b6 · c7557b6
1 parent 1f24f15
commit c7557b6
Show file tree

Hide file tree

Showing 4 changed files with 18 additions and 6 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -1,6 +1,8 @@
 ## CHANGES IN udpipe VERSION 0.8.6
 
 - Downloading models to paths containing non-ASCII characters now works (issue #95)
+- strsplit.data.frame gains a ... argument, which is passed on to strsplit (e.g. use fixed=TRUE to speed up splitting)
+- read_connlu now uses fixed=TRUE when splitting on the newline character (to speed up parsing)
 
 ## CHANGES IN udpipe VERSION 0.8.5
 

diff --git a/R/udpipe_parse.R b/R/udpipe_parse.R
@@ -251,7 +251,7 @@ read_connlu <- function(x, is_udpipe_annotation = FALSE, ...){
       grepl(pattern = prefix, x = x)
     }
   }
-  txt <- strsplit(x$conllu, "\n")[[1]]
+  txt <- strsplit(x$conllu, "\n", fixed = TRUE)[[1]]
   is_sentence_boundary <- txt == ""
   is_comment <- startsWith(txt, "#")
   is_newdoc <- startsWith(txt, "# newdoc")

diff --git a/R/utils.R b/R/utils.R
@@ -693,6 +693,7 @@ paste.data.frame <- function(data, term, group, collapse=" "){
 #' The text in \code{term} will be split into tokens by group.
 #' @param split a regular expression indicating how to split the \code{term} column. 
 #' Defaults to splitting by spaces, punctuation symbols or digits. This will be passed on to \code{\link{strsplit}}.
+#' @param ... further arguments passed on to \code{\link{strsplit}}
 #' @return A tokenised data frame containing one row per token.\cr
 #' This data.frame has the columns from \code{group} and \code{term} where the text in column \code{term}
 #' will be split by the provided regular expression into tokens. 
@@ -705,8 +706,11 @@ paste.data.frame <- function(data, term, group, collapse=" "){
 #' x <- strsplit.data.frame(brussels_reviews, 
 #'                          term = c("feedback"), 
 #'                          group = c("listing_id", "language"))
+#' head(x)  
+#' x <- strsplit.data.frame(brussels_reviews, term = "feedback", group = "id", 
+#'                          split = " ", fixed = TRUE)
 #' head(x)                          
-strsplit.data.frame <- function(data, term, group, split = "[[:space:][:punct:][:digit:]]+"){
+strsplit.data.frame <- function(data, term, group, split = "[[:space:][:punct:][:digit:]]+", ...){
   .SDcols <- .SD <- NULL
   stopifnot(inherits(data, "data.frame"))
   stopifnot(inherits(term, "character"))
@@ -719,13 +723,13 @@ strsplit.data.frame <- function(data, term, group, split = "[[:space:][:punct:][
   }else{
     data <- data.table::as.data.table(data[, c(term, group)])  
   }
-  x <- data[, lapply(.SD, FUN=function(txt){
-    terms <- unlist(strsplit(txt, split = split))
+  x <- data[, lapply(.SD, FUN=function(txt, ...){
+    terms <- unlist(strsplit(txt, split = split, ...))
     terms <- as.character(terms)
     terms <- terms[!is.na(terms)]
     terms <- terms[nchar(terms) > 0]
     terms
-  }), by = group, .SDcols = term]
+  }, ...), by = group, .SDcols = term]
   x <- data.table::setDF(x)
   x
 }

diff --git a/man/strsplit.data.frame.Rd b/man/strsplit.data.frame.Rd