Skip to content

[WIP] pandas migration, text edition #97

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
300 changes: 166 additions & 134 deletions orangecontrib/text/corpus.py

Large diffs are not rendered by default.

7 changes: 5 additions & 2 deletions orangecontrib/text/nyt.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import warnings
import datetime
import numpy as np
import pandas as pd
from datetime import date
from html import unescape
from urllib import request, parse
Expand Down Expand Up @@ -42,7 +43,9 @@ def _parse_record_json(records, includes_metadata):
field_value = " ".join([kw["value"] for kw in field_value if kw])
metas_row.append(unescape(field_value) if isinstance(field_value, str) else field_value)
# Add the pub_date.
metas_row.append(tv.parse(doc.get("pub_date", "")))
raw_pub_date = doc.get("pub_date", "")
metas_row.append(tv.column_to_datetime(pd.Series([raw_pub_date]))[0]
if raw_pub_date is not None else raw_pub_date)
# Add the glocation.
metas_row.append(", ".join([kw["value"] for kw in doc["keywords"] if kw["name"] == "glocations"]))

Expand Down Expand Up @@ -85,7 +88,7 @@ def _generate_corpus(records, required_text_fields):

Y = np.array([class_vars[0].to_val(cv) for cv in class_values])[:, None]

return Corpus(None, Y, metas, domain, meta_vars) # used all features
return Corpus(domain, None, np.array(class_values)[:, None], metas, meta_vars) # used all features


class NYT:
Expand Down
11 changes: 5 additions & 6 deletions orangecontrib/text/pubmed.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from datetime import datetime

import numpy as np
import pandas as pd
from Bio import Entrez
from Bio import Medline
from validate_email import validate_email
Expand Down Expand Up @@ -72,15 +73,15 @@ def _date_to_iso(date):
date_string = datetime.strptime(
date, date_format
).date().isoformat()
return time_var.parse(date_string)
return time_var.column_to_datetime(pd.Series([date_string]))[0].timestamp()
except ValueError:
continue # Try the next format.

warnings.warn(
'Could not parse "{}" into a date.'.format(date),
RuntimeWarning
)
return time_var.parse(np.nan)
return np.nan


def _records_to_corpus_entries(records, includes_metadata):
Expand Down Expand Up @@ -153,9 +154,7 @@ def _corpus_from_records(records, includes_metadata):
]
domain = Domain([], class_vars=class_vars, metas=meta_vars)

Y = np.array([class_vars[0].to_val(cv) for cv in class_values])[:, None]

return Corpus(None, Y, meta_values, domain)
return Corpus(domain, None, np.array(class_values)[:, None], meta_values)


class Pubmed:
Expand Down Expand Up @@ -403,7 +402,7 @@ def _retrieve_records(self, num_records,
if corpus is None:
corpus = _corpus_from_records(records, includes_metadata)
else: # Update the corpus.
corpus.extend_corpus(meta_values, class_values)
corpus = corpus.extend_corpus(meta_values, class_values)

return corpus

Expand Down
113 changes: 52 additions & 61 deletions orangecontrib/text/tests/test_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,10 @@
from distutils.version import LooseVersion

import numpy as np
from scipy.sparse import csr_matrix, issparse
import scipy.sparse as sp

import Orange
from Orange.data import Table
from Orange.data.domain import Domain, StringVariable
from Orange.data import Table, ContinuousVariable, Domain, StringVariable

from orangecontrib.text import preprocess
from orangecontrib.text.corpus import Corpus
Expand Down Expand Up @@ -43,18 +42,13 @@ def test_corpus_from_file_missing(self):
with self.assertRaises(FileNotFoundError):
Corpus.from_file('missing_file')

def test_corpus_from_init(self):
c = Corpus.from_file('bookexcerpts')
c2 = Corpus(c.X, c.Y, c.metas, c.domain, c.text_features)
self.assertEqual(c, c2)

def test_extend_corpus(self):
c = Corpus.from_file('bookexcerpts')
n_classes = len(c.domain.class_var.values)
c_copy = c.copy()
new_y = [c.domain.class_var.values[int(i)] for i in c.Y]
new_y = np.array(c[c.domain.class_var])
new_y[0] = 'teenager'
c.extend_corpus(c.metas, new_y)
c = c.extend_corpus(c.metas, new_y)

self.assertEqual(len(c), len(c_copy)*2)
self.assertEqual(c.Y.shape[0], c_copy.Y.shape[0]*2)
Expand All @@ -74,31 +68,31 @@ def test_extend_attributes(self):
self.assertEqual(c.X.shape, (len(c), 6))

# extend sparse
c.extend_attributes(csr_matrix(X), ['1', '2', '3'])
c.extend_attributes(sp.csr_matrix(X), ['1', '2', '3'])
self.assertEqual(c.X.shape, (len(c), 9))
self.assertTrue(issparse(c.X))
self.assertTrue(sp.issparse(c.X))

def test_corpus_not_eq(self):
c = Corpus.from_file('bookexcerpts')
n_doc = c.X.shape[0]

c2 = Corpus(c.X, c.Y, c.metas, c.domain, [])
c2 = c.copy()
c2.set_text_features([])
self.assertNotEqual(c, c2)

c2 = Corpus(np.ones((n_doc, 1)), c.Y, c.metas, c.domain, c.text_features)
c2 = c.copy()
c2.domain = Domain([ContinuousVariable("foo")], c.domain.class_vars, c.domain.metas)
c2["foo"] = np.ones(n_doc)
self.assertNotEqual(c, c2)

c2 = Corpus(c.X, np.ones((n_doc, 1)), c.metas, c.domain, c.text_features)
c2 = c.copy()
c2[c2.domain.class_var] = [c2.domain.class_var.values[0]] * n_doc
self.assertNotEqual(c, c2)

broken_metas = np.copy(c.metas)
broken_metas[0, 0] = ''
c2 = Corpus(c.X, c.Y, broken_metas, c.domain, c.text_features)
self.assertNotEqual(c, c2)

new_meta = [StringVariable('text2')]
broken_domain = Domain(c.domain.attributes, c.domain.class_var, new_meta)
c2 = Corpus(c.X, c.Y, c.metas, broken_domain, new_meta)
c2 = c.copy()
c2[c2.domain.metas[0]] = np.ravel(broken_metas)
self.assertNotEqual(c, c2)

c2 = c.copy()
Expand Down Expand Up @@ -157,14 +151,13 @@ def test_documents_from_features(self):
def test_documents_from_sparse_features(self):
t = Table.from_file('brown-selected')
c = Corpus.from_table(t.domain, t)
c.X = csr_matrix(c.X)

# docs from X, Y and metas
docs = c.documents_from_features([t.domain.attributes[0], t.domain.class_var, t.domain.metas[0]])
self.assertEqual(len(docs), len(t))
for first_attr, class_val, meta_attr, d in zip(t.X[:, 0], c.Y, c.metas[:, 0], docs):
for first_attr, class_val, meta_attr, d in zip(t.X[:, 0], c.Y.toarray(), c.metas[:, 0], docs):
first_attr = c.domain.attributes[0].str_val(first_attr)
class_val = c.domain.class_var.str_val(class_val)
class_val = c.domain.class_var.values[int(class_val)]
meta_attr = c.domain.metas[0].str_val(meta_attr)
self.assertIn(class_val, d)
self.assertIn(first_attr, d)
Expand All @@ -178,48 +171,46 @@ def test_documents_from_sparse_features(self):
self.assertIn(first_attr, d)

def test_getitem(self):
c = Corpus.from_file('bookexcerpts')

# without preprocessing
self.assertEqual(len(c[:, :]), len(c))

# run default preprocessing
c.tokens

sel = c[:, :]
self.assertEqual(sel, c)

sel = c[0]
self.assertEqual(len(sel), 1)
self.assertEqual(len(sel._tokens), 1)
np.testing.assert_equal(sel._tokens, np.array([c._tokens[0]]))
self.assertEqual(sel._dictionary, c._dictionary)

sel = c[0:5]
self.assertEqual(len(sel), 5)
self.assertEqual(len(sel._tokens), 5)
np.testing.assert_equal(sel._tokens, c._tokens[0:5])
self.assertEqual(sel._dictionary, c._dictionary)

ind = [3, 4, 5, 6]
sel = c[ind]
self.assertEqual(len(sel), len(ind))
self.assertEqual(len(sel._tokens), len(ind))
np.testing.assert_equal(sel._tokens, c._tokens[ind])
self.assertEqual(sel._dictionary, c._dictionary)
self.assertEqual(sel.text_features, c.text_features)
self.assertEqual(sel.ngram_range, c.ngram_range)
self.assertEqual(sel.attributes, c.attributes)
# does not currently work, because of a bug when slicing single
# rows of multi-dtype sparsedataframes, pandas PR pending
with self.assertRaises(Exception):
c = Corpus.from_file('bookexcerpts')

# run default preprocessing
c.tokens

sel = c.iloc[0]
self.assertEqual(len(sel), 1)
self.assertEqual(len(sel._tokens), 1)
np.testing.assert_equal(sel._tokens, np.array([c._tokens[0]]))
self.assertEqual(sel._dictionary, c._dictionary)

sel = c.iloc[0:5]
self.assertEqual(len(sel), 5)
self.assertEqual(len(sel._tokens), 5)
np.testing.assert_equal(sel._tokens, c._tokens[0:5])
self.assertEqual(sel._dictionary, c._dictionary)

ind = [3, 4, 5, 6]
sel = c.iloc[ind]
self.assertEqual(len(sel), len(ind))
self.assertEqual(len(sel._tokens), len(ind))
np.testing.assert_equal(sel._tokens, c._tokens[ind])
self.assertEqual(sel._dictionary, c._dictionary)
self.assertEqual(sel.text_features, c.text_features)
self.assertEqual(sel.ngram_range, c.ngram_range)
self.assertEqual(sel.attributes, c.attributes)

def test_asserting_errors(self):
c = Corpus.from_file('bookexcerpts')

with self.assertRaises(TypeError):
Corpus(1.0, c.Y, c.metas, c.domain, c.text_features)
with self.assertRaises(AttributeError):
# float has no attribute size
Corpus(c.domain, 1.0, c.Y, c.metas, c.text_features)

too_large_x = np.vstack((c.X, c.X))
too_large_y = np.vstack((c.Y, c.Y))
with self.assertRaises(ValueError):
Corpus(too_large_x, c.Y, c.metas, c.domain, c.text_features)
Corpus(c.domain, c.X, too_large_y, c.metas, c.text_features)

with self.assertRaises(ValueError):
c.set_text_features([StringVariable('foobar')])
Expand All @@ -228,7 +219,7 @@ def test_asserting_errors(self):
c.set_text_features([c.domain.metas[0], c.domain.metas[0]])

c.tokens # preprocess
with self.assertRaises(TypeError):
with self.assertRaises(KeyError):
c[..., 0]

def test_copy(self):
Expand Down
2 changes: 1 addition & 1 deletion orangecontrib/text/tests/test_nyt.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ def test_nyt_corpus_domain_generation(self):
[StringVariable.make('pub_date'), StringVariable.make('country')]

self.assertEqual(len(meta_vars), len(corpus.domain.metas))
self.assertEqual(len(corpus.Y), 10)
self.assertEqual(len(corpus), 10)

def test_nyt_result_caching(self):
# Run a query to create a cache entry first.
Expand Down
11 changes: 5 additions & 6 deletions orangecontrib/text/tests/test_preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,14 +32,13 @@ def transform(cls, string):
return string[:-1]
p = Preprocessor(transformers=StripStringTransformer())

np.testing.assert_equal(p(self.corpus).tokens,
np.array([[doc[:-1]] for doc in self.corpus.documents]))
self.assertSequenceEqual(list(p(self.corpus).tokens), [[d[:-1]] for d in self.corpus.documents])

p = Preprocessor(transformers=[StripStringTransformer(),
preprocess.LowercaseTransformer()])

np.testing.assert_equal(p(self.corpus).tokens,
np.array([[doc[:-1].lower()] for doc in self.corpus.documents]))
self.assertSequenceEqual(list(p(self.corpus).tokens),
[[doc[:-1].lower()] for doc in self.corpus.documents])

self.assertRaises(TypeError, Preprocessor, string_transformers=1)

Expand All @@ -60,8 +59,8 @@ def normalize(cls, token):
return token.capitalize()
p = Preprocessor(normalizer=CapTokenNormalizer())

np.testing.assert_equal(p(self.corpus).tokens,
np.array([[sent.capitalize()] for sent in self.corpus.documents]))
self.assertSequenceEqual(list(p(self.corpus).tokens),
[[sent.capitalize()] for sent in self.corpus.documents])

def test_token_filter(self):
class SpaceTokenizer(preprocess.BaseTokenizer):
Expand Down
4 changes: 2 additions & 2 deletions orangecontrib/text/tests/test_topic_modeling.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,15 +33,15 @@ def test_fit_transform(self):

def test_get_topic_table_by_id(self):
self.model.fit(self.corpus)
topic1 = self.model.get_topics_table_by_id(1)
topic1 = self.model.get_topics_table_by_id(0)
self.assertEqual(len(topic1), len(self.corpus.dictionary))
self.assertEqual(topic1.metas.shape, (len(self.corpus.dictionary), 2))
# self.assertAlmostEqual(topic1.W.sum(), 1.)
self.assertFalse(any(topic1.W == np.nan))

def test_top_words_by_topic(self):
self.model.fit(self.corpus)
words = self.model.get_top_words_by_id(1, num_of_words=10)
words = self.model.get_top_words_by_id(0, num_of_words=10)
self.assertTrue(all([isinstance(word, str) for word in words]))
self.assertEqual(len(words), 10)

Expand Down
2 changes: 1 addition & 1 deletion orangecontrib/text/tests/test_twitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ def test_create_corpus(self):
self.assertIsInstance(corpus, Corpus)
self.assertEqual(len(corpus), 5)

def test_crate_corpus_attr_selection(self):
def test_create_corpus_attr_selection(self):
self.api.search(word_list=['hello'], max_tweets=5)
self.api.join()
attributes = ['text', 'created_at', 'author_id']
Expand Down
12 changes: 4 additions & 8 deletions orangecontrib/text/topics/topics.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,7 @@
import numpy as np
from gensim.corpora import Dictionary

from Orange.data import StringVariable, ContinuousVariable, Domain
from Orange.data.table import Table
from Orange.data import StringVariable, ContinuousVariable, Domain, Table, TableSeries
from orangecontrib.text.corpus import Corpus


Expand All @@ -22,12 +21,9 @@ def chunks(iterable, chunk_size):


class Topics(Table):
""" Dummy wrapper for Table so signals can distinguish Topics from Data.
"""Dummy wrapper for Table so signals can distinguish Topics from Data.
"""

def __new__(cls, *args, **kwargs):
""" Bypass Table.__new__. """
return object.__new__(Topics)
pass


class GensimWrapper:
Expand Down Expand Up @@ -66,7 +62,7 @@ def reset_model(self, corpus):
_update = self.Model.update
self.Model.update = self.dummy_method
self.id2word = Dictionary(corpus.ngrams, prune_at=None)
self.model = self.Model(corpus=corpus,
self.model = self.Model(corpus=corpus.ngrams_corpus,
id2word=self.id2word, **self.kwargs)
self.Model.update = _update

Expand Down
4 changes: 2 additions & 2 deletions orangecontrib/text/twitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ def create_corpus(self, included_attributes=None):
def to_val(attr, val):
if isinstance(attr, data.DiscreteVariable) and val not in attr.values:
attr.add_value(val)
return attr.to_val(val)
return val

X = np.array([
[to_val(attr, record[attr.name]) for attr, _ in attributes]
Expand All @@ -206,7 +206,7 @@ def to_val(attr, val):
], dtype=object)
self.statuses_lock.release()

return Corpus(X=X, metas=metas, domain=domain, text_features=text_features)
return Corpus(domain, X, None, metas, text_features)

def reset(self):
""" Removes all downloaded tweets. """
Expand Down
4 changes: 2 additions & 2 deletions orangecontrib/text/vectorization/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,6 @@ def report(self):
def add_features(corpus, X, dictionary):
order = np.argsort([dictionary[i] for i in range(len(dictionary))])
corpus.extend_attributes(X[:, order],
feature_names=(dictionary[i] for i in order),
feature_names=[dictionary[i] for i in order],
var_attrs={'hidden': True})
corpus.ngrams_corpus = matutils.Sparse2Corpus(X.T)
corpus.store_ngrams_corpus([dictionary[i] for i in order])
2 changes: 1 addition & 1 deletion orangecontrib/text/vectorization/simhash.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def _transform(self, corpus):

X = np.array([self.int2binarray(self.compute_hash(doc)) for doc in corpus.tokens], dtype=np.float)
corpus = corpus.copy()
corpus.extend_attributes(X, ('simhash_{}'.format(int(i) + 1) for i in range(self.f)),
corpus.extend_attributes(X, ['simhash_{}'.format(int(i) + 1) for i in range(self.f)],
var_attrs={'hidden': True})
return corpus

Expand Down
Loading