Skip to content

Commit

Permalink
avoid word2vec dependency + add warning if terminology is provided wi…
Browse files Browse the repository at this point in the history
…th no embedding space
  • Loading branch information
jwijffels committed Jul 8, 2021
1 parent 85a2b71 commit 93cba4a
Show file tree
Hide file tree
Showing 5 changed files with 56 additions and 27 deletions.
12 changes: 12 additions & 0 deletions R/datasets.R
Original file line number Diff line number Diff line change
Expand Up @@ -113,3 +113,15 @@ NULL
#' str(brussels_reviews_anno)
#' }
NULL



#' @title An example matrix of word embeddings
#' @description An simple 10-dimensional example matrix of word embeddings trained on the Dutch lemma's
#' of the dataset \code{\link{brussels_reviews_anno}}
#' @name brussels_reviews_w2v_embeddings_lemma_nl
#' @docType data
#' @examples
#' data(brussels_reviews_w2v_embeddings_lemma_nl)
#' head(brussels_reviews_w2v_embeddings_lemma_nl)
NULL
32 changes: 18 additions & 14 deletions R/nlp_flow.R
Original file line number Diff line number Diff line change
Expand Up @@ -1292,34 +1292,33 @@ dtm_sample <- function(dtm, size = nrow(dtm), replace = FALSE, prob = NULL){
#' text(scores$terminology$similarity_weight, log(scores$terminology$freq),
#' labels = scores$terminology$term)
#'
#' \dontshow{
#' if(FALSE && require(word2vec))
#' \{
#' }
#' \dontrun{
#' ## More elaborate example using word2vec
#' ## building word2vec model on all Dutch texts,
#' ## finding similarity of dtm to adjectives only
#' set.seed(123)
#' library(word2vec)
#' text <- subset(brussels_reviews_anno, language == "nl")
#' text <- paste.data.frame(text, term = "lemma", group = "doc_id")
#' text <- text$lemma
#' model <- word2vec(text, dim = 10, iter = 20, type = "cbow", min_count = 1)
#' predict(model, newdata = names(weights), type = "nearest", top_n = 3)
#' embedding <- as.matrix(model)
#' adj <- subset(brussels_reviews_anno, language %in% "nl" & upos %in% "ADJ")
#' adj <- txt_freq(adj$lemma)
#' adj <- subset(adj, freq > 1 & nchar(key) > 1)
#' }
#' data(brussels_reviews_w2v_embeddings_lemma_nl)
#' embedding <- brussels_reviews_w2v_embeddings_lemma_nl
#' adjective <- subset(brussels_reviews_anno, language %in% "nl" & upos %in% "ADJ")
#' adjective <- txt_freq(adjective$lemma)
#' adjective <- subset(adjective, freq >= 5 & nchar(key) > 1)
#' adjective <- adjective$key
#'
#' scores <- dtm_svd_similarity(dtm, embedding, weights = weights, type = "dot",
#' terminology = adj$key)
#' terminology = adjective)
#' scores
#' plot(scores$terminology$similarity_weight, log(scores$terminology$freq),
#' type = "n")
#' text(scores$terminology$similarity_weight, log(scores$terminology$freq),
#' labels = scores$terminology$term)
#' \dontshow{
#' \}
#' # End of main if statement running only if the required packages are installed
#' }
#' labels = scores$terminology$term, cex = 0.8)
dtm_svd_similarity <- function(dtm, embedding, weights, terminology = rownames(embedding), type = c("cosine", "dot")){
doc_id <- term <- prop <- in_terminology <- NULL
embedding_similarity <- function(x, y, type = c("cosine", "dot")) {
Expand Down Expand Up @@ -1358,6 +1357,10 @@ dtm_svd_similarity <- function(dtm, embedding, weights, terminology = rownames(e
if(missing(terminology)){
terminology <- object$terminology
}else{
missing_embeddings <- setdiff(terminology, object$terminology)
if(length(missing_embeddings) > 0){
warning(sprintf("Removing '%s' from terminology as these are not part of rownames(embedding)", paste(missing_embeddings, collapse = ", ")))
}
terminology <- intersect(terminology, object$terminology)
}
not_known_weights <- setdiff(names(weights), object$terminology)
Expand Down Expand Up @@ -1426,9 +1429,10 @@ dtm_svd_similarity <- function(dtm, embedding, weights, terminology = rownames(e
terminology_similarity <- sort(weightspace, decreasing = TRUE)
terminology_similarity <- data.frame(
term = names(terminology_similarity),
freq = txt_recode(names(terminology_similarity), from = names(freq), to = as.integer(freq)),
freq = txt_recode(names(terminology_similarity), from = names(freq), to = as.integer(freq), na.rm = TRUE),
similarity_weight = as.numeric(terminology_similarity),
stringsAsFactors = FALSE)
terminology_similarity$freq <- ifelse(is.na(terminology_similarity$freq), 0, terminology_similarity$freq)

result <- list(weights = weights_scaled,
type = type,
Expand Down
Binary file not shown.
14 changes: 14 additions & 0 deletions man/brussels_reviews_w2v_embeddings_lemma_nl.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

25 changes: 12 additions & 13 deletions man/dtm_svd_similarity.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 93cba4a

Please sign in to comment.