Skip to content

Instantly share code, notes, and snippets.

title: "Get word embeddings from BERT"
format: html
editor: source
chunk_output_type: console
Убедимся, что у нас третий питон:
# Read the whole PDF into memory as a raw (byte) vector.
path <- file.path("os2.pdf")
# readBin() requires the maximum number of bytes to read; pass the file's size
# on disk so the entire document is read in one call.
pdf <- readBin(con = path, what = raw(), n = file.info(path)$size)
# Open it
# Connect to the default DB and put the serialized raw PDF in a data.frame
from nltk.probability import FreqDist
import math
import pickle
from top2vec import Top2Vec
import numpy as np
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import strip_tags
from tqdm import tqdm
# by G. Moroz
# License: GPL-2
netflix_titles <- read_csv('')
title: "Using Gensim in R"
author: "Adam Lauretig"
date: "3/17/2018"
output: html_document
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
inkrement / clickhousedump
Created August 19, 2017 14:26
dump all clickhouse databases and tables
while read -r db ; do
while read -r table ; do
if [ "$db" == "system" ]; then
echo "skip system db"
continue 2;
tjvananne / process GloVe pre-trained word vector.R
Created May 4, 2017 14:45
How to read and process a downloaded pre-trained GloVe word vector (turn it into a data.frame) in base R
#' A word vector is a giant matrix of words, and each word contains a numeric array that represents the semantic
#' meaning of that word. This is useful so we can discover relationships and analogies between words programmatically.
#' The classic example is "king" minus "man" plus "woman" is most similar to "queen"
# function definition --------------------------------------------------------------------------
# input .txt file, exports list of list of values and character vector of names (words)
proc_pretrained_vec <- function(p_vec) {
graydon /
Created April 23, 2014 00:03
country bounding boxes
# extracted from http//
# under public domain terms
country_bounding_boxes = {
'AF': ('Afghanistan', (60.5284298033, 29.318572496, 75.1580277851, 38.4862816432)),
'AO': ('Angola', (11.6400960629, -17.9306364885, 24.0799052263, -4.43802336998)),
'AL': ('Albania', (19.3044861183, 39.624997667, 21.0200403175, 42.6882473822)),
'AE': ('United Arab Emirates', (51.5795186705, 22.4969475367, 56.3968473651, 26.055464179)),
'AR': ('Argentina', (-73.4154357571, -55.25, -53.628348965, -21.8323104794)),
'AM': ('Armenia', (43.5827458026, 38.7412014837, 46.5057198423, 41.2481285671)),
#' Default candidate cluster counts for a data set with `rows` observations.
#'
#' @param rows Number of observations.
#' @return The sequence from 2 up to `max(3, rows %/% 4)`.
ks.default <- function(rows) {
  # The upper bound is a quarter of the rows (integer division), never below 3.
  upper <- max(3, rows %/% 4)
  seq(2, upper)
}
many_kmeans <- function(x, ks = ks.default(nrow(x)), ...) {
ldply(seq_along(ks), function(i) {
cl <- kmeans(x, centers = ks[i], ...)
data.frame(obs = seq_len(nrow(x)), i = i, k = ks[i], cluster = cl$cluster)
all_hclust <- function(x, ks = ks.default(nrow(x)), point.dist = "euclidean", cluster.dist = "ward") {