Skip to content

Commit

Permalink
allow usage of fixed=TRUE
Browse files Browse the repository at this point in the history
  • Loading branch information
jwijffels committed May 26, 2021
1 parent 1f24f15 commit c7557b6
Show file tree
Hide file tree
Showing 4 changed files with 18 additions and 6 deletions.
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
## CHANGES IN udpipe VERSION 0.8.6

- Downloading models to paths containing non-ASCII characters now works (issue #95)
- strsplit.data.frame gains a ... argument, which is passed on to strsplit (e.g. use fixed=TRUE to speed up splitting)
- read_connlu now uses fixed=TRUE when splitting on the newline character (to speed up parsing)

## CHANGES IN udpipe VERSION 0.8.5

Expand Down
2 changes: 1 addition & 1 deletion R/udpipe_parse.R
Original file line number Diff line number Diff line change
Expand Up @@ -251,7 +251,7 @@ read_connlu <- function(x, is_udpipe_annotation = FALSE, ...){
grepl(pattern = prefix, x = x)
}
}
txt <- strsplit(x$conllu, "\n")[[1]]
txt <- strsplit(x$conllu, "\n", fixed = TRUE)[[1]]
is_sentence_boundary <- txt == ""
is_comment <- startsWith(txt, "#")
is_newdoc <- startsWith(txt, "# newdoc")
Expand Down
12 changes: 8 additions & 4 deletions R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -693,6 +693,7 @@ paste.data.frame <- function(data, term, group, collapse=" "){
#' The text in \code{term} will be split into tokens by group.
#' @param split a regular expression indicating how to split the \code{term} column.
#' Defaults to splitting by spaces, punctuation symbols or digits. This will be passed on to \code{\link{strsplit}}.
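#' @param ... further arguments passed on to \code{\link{strsplit}}, e.g. \code{fixed = TRUE} to treat \code{split} as a literal string instead of a regular expression (faster)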
#' @return A tokenised data frame containing one row per token.\cr
#' This data.frame has the columns from \code{group} and \code{term} where the text in column \code{term}
#' will be split by the provided regular expression into tokens.
Expand All @@ -705,8 +706,11 @@ paste.data.frame <- function(data, term, group, collapse=" "){
#' x <- strsplit.data.frame(brussels_reviews,
#' term = c("feedback"),
#' group = c("listing_id", "language"))
#' head(x)
#' x <- strsplit.data.frame(brussels_reviews, term = "feedback", group = "id",
#' split = " ", fixed = TRUE)
#' head(x)
strsplit.data.frame <- function(data, term, group, split = "[[:space:][:punct:][:digit:]]+"){
strsplit.data.frame <- function(data, term, group, split = "[[:space:][:punct:][:digit:]]+", ...){
.SDcols <- .SD <- NULL
stopifnot(inherits(data, "data.frame"))
stopifnot(inherits(term, "character"))
Expand All @@ -719,13 +723,13 @@ strsplit.data.frame <- function(data, term, group, split = "[[:space:][:punct:][
}else{
data <- data.table::as.data.table(data[, c(term, group)])
}
x <- data[, lapply(.SD, FUN=function(txt){
terms <- unlist(strsplit(txt, split = split))
x <- data[, lapply(.SD, FUN=function(txt, ...){
terms <- unlist(strsplit(txt, split = split, ...))
terms <- as.character(terms)
terms <- terms[!is.na(terms)]
terms <- terms[nchar(terms) > 0]
terms
}), by = group, .SDcols = term]
}, ...), by = group, .SDcols = term]
x <- data.table::setDF(x)
x
}
Expand Down
8 changes: 7 additions & 1 deletion man/strsplit.data.frame.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit c7557b6

Please sign in to comment.