-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathiterator_getLineTermsFromALargeDoc.py
More file actions
59 lines (47 loc) · 1.22 KB
/
Copy pathiterator_getLineTermsFromALargeDoc.py
File metadata and controls
59 lines (47 loc) · 1.22 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
# -*- coding: utf-8 -*-
'''
@file iterator_getLineTermsFromALargeDoc.py
@author shulong tan ([email protected])
@date 2016-11
@comments Iteratively read the tab-separated terms of each line from a large document; provides both an iterator-class and a generator (yield) implementation.
'''
# iterator
class getLineTermsofLongDoc(object):
    """Iterator over the well-formed, tab-separated lines of an open file.

    Each line read from ``file`` is stripped of surrounding whitespace and
    split on tab characters.  A line is well-formed when it has exactly five
    fields; it is returned as a list, with a fifth field equal to the literal
    ``NULL`` replaced by an empty string.  Any other line is reported by
    printing its field count and then skipped.

    Lines are read one at a time with ``readline()``, so the whole document
    is never held in memory.  Both text (``str``) and binary (``bytes``)
    file objects are supported; the returned fields have the same type as
    the lines the file produces.

    NOTE: a generator function with the same name is defined later in this
    file and rebinds the module-level name.

    Args:
        file: an open file-like object exposing ``readline()``.  The caller
            keeps ownership and is responsible for closing it.
    """

    def __init__(self, file):
        self.file = file

    def __iter__(self):
        return self

    def next(self):
        """Return the field list of the next well-formed line.

        Raises:
            StopIteration: when the underlying file is exhausted.
        """
        # Loop (rather than recurse) over bad lines so that a skipped line
        # never yields None and a long run of bad lines cannot exhaust the
        # recursion limit.
        while True:
            line = self.file.readline()
            if not line:  # readline() returns an empty value only at EOF
                raise StopIteration()

            # Pick literals matching the line type so binary-mode files work
            # on Python 3 (on Python 2, bytes is str and nothing changes).
            if isinstance(line, bytes):
                tab, null, empty = b'\t', b'NULL', b''
            else:
                tab, null, empty = '\t', 'NULL', ''

            terms = line.strip().split(tab)
            if len(terms) == 5:
                if terms[4] == null:
                    terms[4] = empty
                return terms

            # Bad line: report its field count, then try the next line.
            print(len(terms))

    # Python 3 spells the iterator-protocol method ``__next__``.
    __next__ = next
#yield
def getLineTermsofLongDoc(fpath):
    """Lazily yield the field list of every well-formed line of a document.

    Each line is stripped of surrounding whitespace and split on tab
    characters.  Lines that do not have exactly five fields are skipped
    silently.  In a well-formed line, a fifth field equal to the literal
    ``NULL`` is replaced by an empty string.

    NOTE: this function reuses the name of the iterator class defined
    earlier in this file and therefore rebinds it.  To keep callers that
    pass an already-open file working, an open file object is accepted in
    place of a path.

    Args:
        fpath: path of the file to read (opened in binary mode and closed
            when the generator finishes or is closed), or an already-open
            file object (iterated as-is and left open for the caller).

    Yields:
        list: the five fields of each well-formed line, of the same type
        (``bytes`` or ``str``) as the lines being read.
    """
    opened_here = not hasattr(fpath, 'read')
    f = open(fpath, 'rb') if opened_here else fpath
    try:
        for line in f:
            # Pick literals matching the line type so binary-mode files work
            # on Python 3 (on Python 2, bytes is str and nothing changes).
            if isinstance(line, bytes):
                tab, null, empty = b'\t', b'NULL', b''
            else:
                tab, null, empty = '\t', 'NULL', ''

            terms = line.strip().split(tab)
            if len(terms) != 5:  # skip bad lines
                continue
            if terms[4] == null:
                terms[4] = empty
            yield terms
    finally:
        # Only close handles this function opened itself.
        if opened_here:
            f.close()
if __name__ == '__main__':
    # Demo: print the index and parsed terms of every well-formed line of
    # 'test.txt', once from an already-open file and once from its path.
    fpath = 'test.txt'

    # for iterator
    # NOTE(review): the generator function defined after the iterator class
    # reuses the same name, so this call resolves to that function rather
    # than to the class.
    with open(fpath, 'rb') as f:
        for idx, terms in enumerate(getLineTermsofLongDoc(f)):
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(idx)
            print(terms)

    # for yield
    for idx, terms in enumerate(getLineTermsofLongDoc(fpath)):
        print(idx)
        print(terms)