-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathutils.py
More file actions
107 lines (75 loc) · 2.7 KB
/
utils.py
File metadata and controls
107 lines (75 loc) · 2.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import pandas as pd
import numpy as np
import nltk
import re
from nltk.tokenize import word_tokenize, sent_tokenize
import string
from string import punctuation
from nltk.corpus import stopwords
stop = set(stopwords.words('english'))
stop_words = set(stopwords.words('english'))
english_words = set(nltk.corpus.words.words())
stop_words.update(['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}']) # remove it if you need punctuation
tokenized_xtext = []
for each_sent in incoming_message:
# print(each_sent)
sentences = sent_tokenize(each_sent)
print(sentences)
for each_word in sentences:
print(each_word)
if each_word not in stop_words and each_word not in string.punctuation:
words = word_tokenize(each_word.lower())
tokenized_xtext.append(words)
tokenized_ytext = []
for each_sent in target_message:
# print(each_sent)
sentences = sent_tokenize(each_sent)
print(sentences)
for each_word in sentences:
print(each_word)
if each_word not in stop_words and each_word not in string.punctuation:
words = word_tokenize(each_word.lower())
tokenized_ytext.append(words)
counter = 0
for each_sent in tokenized_ytext:
each_sent.insert(0,'START')
each_sent.append('END')
counter += 1
complete_text = tokenized_xtext + tokenized_ytext
vocab = {}
for each_sent in complete_text:
for each_word in each_sent:
if each_word not in vocab:
vocab[each_word] = 1
else:
vocab[each_word] += 1
token = ['_PAD', '_GO', '_EOS', '_UNK']
vocab_list = token + sorted(vocab, key = vocab.get ,reverse = True)
input_vocab = {}
for each_sent in tokenized_xtext:
for each_word in each_sent:
if each_word not in input_vocab:
input_vocab[each_word] = 1
else:
input_vocab[each_word] += 1
input_vocabulary = sorted(input_vocab, key = input_vocab.get,reverse = True)
input_word2index = {}
for index, word in enumerate(input_vocabulary):
input_word2index[word] = index
input_index2word = {}
for index,word in input_word2index.items():
input_index2word[index] = word
target_vocab = {}
for each_sent in tokenized_ytext:
for each_word in each_sent:
if each_word not in target_vocab:
target_vocab[each_word] = 1
else:
target_vocab[each_word] += 1
target_vocabulary = sorted(target_vocab, key = target_vocab.get, reverse = True)
target_word2index = {}
for index,word in enumerate(target_vocabulary):
target_word2index[word] = index
target_index2word = {}
for index,word in target_word2index.items():
target_index2word[index] = word