Skip to content

Commit

Permalink
Add txt_paste and txt_context
Browse files Browse the repository at this point in the history
  • Loading branch information
jwijffels committed May 27, 2021
1 parent e66232b commit 648e469
Show file tree
Hide file tree
Showing 5 changed files with 214 additions and 2 deletions.
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -58,12 +58,14 @@ export(phrases)
export(strsplit.data.frame)
export(txt_collapse)
export(txt_contains)
export(txt_context)
export(txt_count)
export(txt_freq)
export(txt_highlight)
export(txt_next)
export(txt_nextgram)
export(txt_overlap)
export(txt_paste)
export(txt_previous)
export(txt_previousgram)
export(txt_recode)
Expand Down
4 changes: 3 additions & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@

- Downloading models to paths containing non-ASCII characters now works (issue #95)
- strsplit.data.frame gains ... which are passed on to strsplit (e.g. to use fixed=TRUE for speeding up)
- read_connlu is now using fixed=TRUE when splitting by newline symbol (for speeding up)
- read_connlu is now using fixed=TRUE when splitting by newline symbol (for speeding up parsing with function udpipe)
- Added txt_paste
- Added txt_context

## CHANGES IN udpipe VERSION 0.8.5

Expand Down
106 changes: 105 additions & 1 deletion R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
#' x <- list(a = c("h", "i"), b = c("some", "more", "text"),
#' c = character(), d = NA)
#' txt_collapse(x, collapse = " ")
txt_collapse <- function(x, collapse=" "){
txt_collapse <- function(x, collapse = " "){
if(!is.list(x)){
x <- as.character(x)
x <- x[!is.na(x)]
Expand All @@ -35,6 +35,49 @@ txt_collapse <- function(x, collapse=" "){
x
}

#' @title Concatenate strings with options how to handle missing data
#' @description NA-friendly version of \code{paste} for concatenating strings
#' @param ... character vectors
#' @param collapse a character string to be used to paste the vectors together. Defaults to a space: ' '.
#' @param na.rm logical, if set to \code{TRUE}, missing values are dropped before pasting (the result is only \code{NA} if all elements are \code{NA}). If set to \code{FALSE}, will have a resulting value of NA
#' if at least one element is \code{NA}, in a similar spirit as \code{mean}. Defaults to \code{FALSE}.
#' @return a character vector
#' @export
#' @seealso \code{\link{paste}}
#' @examples
#' x <- c(1, 2, 3, NA, NA)
#' y <- c("a", "b", "c", NA, "OK")
#' paste(x, y, sep = "-")
#' txt_paste(x, y, collapse = "-", na.rm = TRUE)
#' txt_paste(x, y, collapse = "-", na.rm = FALSE)
#'
#' x <- c(NA, "a", "b")
#' y <- c("1", "2", NA)
#' z <- c("-", "*", NA)
#' txt_paste(x, y, z, collapse = "", na.rm = TRUE)
#' txt_paste(x, y, z, "_____", collapse = "", na.rm = TRUE)
#' txt_paste(x, y, z, "_____", collapse = "", na.rm = FALSE)
txt_paste <- function(..., collapse = " ", na.rm = FALSE){
  # Row-wise paste of the vectors passed in ...
  # data.frame() lines the inputs up next to each other (recycling shorter
  # vectors whose length divides the longest one).
  x <- data.frame(list(...), stringsAsFactors = FALSE)
  # Convert every column to character explicitly. Without this, as.matrix()
  # on a data.frame mixing numeric and character columns runs format() on the
  # numeric columns, which pads them to a common width (1 becomes "  1" when
  # 100 is also present) and leaks that padding into the pasted result.
  if(ncol(x) > 0){
    x[] <- lapply(x, FUN = as.character)
  }
  x <- as.matrix(x)
  if(na.rm){
    # Drop the missing elements of a row; the row is NA only if nothing is left
    paste_row <- function(row){
      row <- row[!is.na(row)]
      if(length(row) == 0){
        return(NA_character_)
      }
      paste(row, collapse = collapse)
    }
  }else{
    # A single missing element makes the whole row NA
    paste_row <- function(row){
      if(anyNA(row)){
        return(NA_character_)
      }
      paste(row, collapse = collapse)
    }
  }
  apply(x, MARGIN = 1, FUN = paste_row)
}





Expand Down Expand Up @@ -221,6 +264,67 @@ txt_previousgram <- function(x, n = 2, sep = " "){
out
}

#' @title Based on a vector with a word sequence, get n-grams (looking forward + backward)
#' @description If you have annotated your text using \code{\link{udpipe_annotate}},
#' your text is tokenised in a sequence of words. Based on this vector of words in sequence
#' getting n-grams comes down to looking at the previous/next word and the subsequent previous/next word, and so forth.
#' These words can be \code{pasted} together to form an n-gram.
#' @param x a character vector where each element is just 1 term or word
#' @param n an integer vector indicating how many terms to look back and ahead
#' @param sep a character element indicating how to \code{\link{paste}} the subsequent words together
#' @param na.rm logical, if set to \code{TRUE}, will keep all text even if it can not look back/ahead the amount specified by \code{n}.
#' If set to \code{FALSE}, will have a resulting value of \code{NA}
#' if at least one element is \code{NA} or it can not look back/ahead the amount specified by \code{n}.
#' @return a character vector of the same length as \code{x} with the n-grams
#' @seealso \code{\link{txt_paste}}, \code{\link{txt_next}}, \code{\link{txt_previous}}, \code{\link[data.table]{shift}}
#' @export
#' @examples
#' x <- c("We", "walked", "anxiously", "to", "the", "doctor", "!")
#'
#' ## Look 1 word before + word itself
#' y <- txt_context(x, n = c(-1, 0), na.rm = FALSE)
#' data.frame(x, y)
#' ## Look 1 word before + word itself + 1 word after
#' y <- txt_context(x, n = c(-1, 0, 1), na.rm = FALSE)
#' data.frame(x, y)
#' y <- txt_context(x, n = c(-1, 0, 1), na.rm = TRUE)
#' data.frame(x, y)
#'
#' ## Look 2 words before + word itself + 1 word after
#' ## even if not all words are there
#' y <- txt_context(x, n = c(-2, -1, 0, 1), na.rm = TRUE, sep = "_")
#' data.frame(x, y)
#' y <- txt_context(x, n = c(-2, -1, 1, 2), na.rm = FALSE, sep = "_")
#' data.frame(x, y)
#'
#' x <- c("We", NA, NA, "to", "the", "doctor", "!")
#' y <- txt_context(x, n = c(-1, 0), na.rm = FALSE)
#' data.frame(x, y)
#' y <- txt_context(x, n = c(-1, 0), na.rm = TRUE)
#' data.frame(x, y)
#'
#' library(data.table)
#' data(brussels_reviews_anno, package = "udpipe")
#' x <- as.data.table(brussels_reviews_anno)
#' x <- x[, context := txt_context(lemma), by = list(doc_id, sentence_id)]
#' head(x, 20)
#' x$term <- sprintf("%s/%s", x$lemma, x$upos)
#' x <- x[, context := txt_context(term), by = list(doc_id, sentence_id)]
#' head(x, 20)
txt_context <- function(x, n = c(-1, 0, 1), sep = " ", na.rm = FALSE){
  # Shift x by one offset: negative offsets look back via txt_previous,
  # zero and positive offsets look ahead via txt_next
  shift_by <- function(offset){
    if(offset < 0){
      txt_previous(x, n = -offset)
    }else{
      txt_next(x, n = offset)
    }
  }
  # One shifted copy of x per requested offset, in the order given by n
  pieces <- lapply(n, FUN = shift_by)
  # Add the pasting options and hand everything to txt_paste as arguments
  pieces[["collapse"]] <- sep
  pieces[["na.rm"]] <- na.rm
  do.call(txt_paste, pieces)
}

#' @title Get the n-th previous element of a vector
#' @description Get the n-th previous element of a vector
#' @param x a character vector where each element is just 1 term or word
Expand Down
65 changes: 65 additions & 0 deletions man/txt_context.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

39 changes: 39 additions & 0 deletions man/txt_paste.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 648e469

Please sign in to comment.