-
Notifications
You must be signed in to change notification settings - Fork 0
/
indexer.py
71 lines (56 loc) · 2.1 KB
/
indexer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# -*- coding: utf-8 -*-
"""
Created on Tue Dec 11 17:57:13 2018
@author: tpayer
"""
import os, os.path
from whoosh import index
#import whoosh.index as index
from whoosh.fields import *
import glob
from bs4 import BeautifulSoup
# import frogress # frogress.bar no longer available??
from tqdm import tqdm
from whoosh.writing import AsyncWriter
import re
from whoosh.writing import BufferedWriter
def striptags(data):
    """Return ``data`` with markup tags removed and outer whitespace trimmed.

    Every span running from a ``<`` to the nearest following ``>`` is
    deleted. The pattern's ``.`` does not match newlines, so a tag that is
    broken across lines is left untouched.
    """
    without_tags = re.sub(r'<.*?>', '', data)
    return without_tags.strip()
# define a schema for the indexer
def get_schema():
    """Build the whoosh ``Schema`` describing one indexed document.

    Fields:
        docno: ``ID`` declared unique and stored.
        headline: stored ``TEXT``.
        path: stored ``ID``.
        content: the bare ``TEXT`` field class (no explicit options).
    """
    fields = {
        "docno": ID(unique=True, stored=True),
        "headline": TEXT(stored=True),
        "path": ID(stored=True),
        "content": TEXT,
    }
    return Schema(**fields)
def add_doc(writer, path):
    """Parse one corpus file and add each ``<DOC>`` it contains to ``writer``.

    The file content is wrapped in a synthetic ``<root>`` element before
    parsing so that several sibling ``<DOC>`` elements form a single XML
    tree.

    Args:
        writer: index writer; ``add_document`` is called once per document
            with the ``docno``, ``headline``, ``path`` and ``content`` fields.
        path: path of the file to read; also stored as each document's
            ``path`` field.

    A document without a ``<DOCNO>`` element is skipped. A missing
    ``<HEADLINE>`` or ``<TEXT>`` element is indexed as an empty string
    instead of the literal text ``"None"``.
    """
    # NOTE(review): the file is decoded with the platform default encoding,
    # exactly as before -- confirm that this matches the corpus files.
    with open(path, "r") as handle:
        raw = handle.read()

    soup = BeautifulSoup('<root>' + raw + '</root>', 'xml')

    def tag_text(tag):
        # Attribute access on a parsed element yields None when the child
        # tag is absent; str(None) would otherwise be indexed as "None".
        if tag is None:
            return ''
        return striptags(str(tag))

    for doc in soup.find_all('DOC'):
        if doc.DOCNO is None:
            # No identifier to index the document under.
            continue
        writer.add_document(
            docno=tag_text(doc.DOCNO),
            headline=tag_text(doc.HEADLINE),
            path=path,
            content=tag_text(doc.TEXT),
        )
def index_TREC_ROBUST_04(index_dir="indexdir",
                         data_dir=os.path.join(".", "data"),
                         sources=('latimes', 'fbis', 'fr94', 'ft')):
    """Build a fresh whoosh index over the TREC Robust 2004 collection.

    Args:
        index_dir: directory that holds the index; created if missing.
        data_dir: directory containing one sub-directory per source.
        sources: names of the sub-directories of ``data_dir`` to index.

    Every file directly inside ``<data_dir>/<source>/`` is passed to
    ``add_doc``. The index is created anew on each call, so running the
    function again does not append duplicate documents.
    """
    # makedirs(exist_ok=True) avoids the check-then-create race of
    # os.path.exists() followed by os.mkdir().
    os.makedirs(index_dir, exist_ok=True)
    ix = index.create_in(index_dir, get_schema())

    doclist = []
    for source in sources:
        # glob returns files in arbitrary order; sort so runs are reproducible.
        doclist.extend(sorted(glob.glob(os.path.join(data_dir, source, "*"))))

    # The writer's context manager commits on success and cancels on error,
    # so a failure while parsing does not leave a half-written segment.
    # NOTE(review): the original comment suggested AsyncWriter as a fallback
    # if acquiring the writer fails
    # (https://whoosh.readthedocs.io/en/latest/api/writing.html#whoosh.writing.AsyncWriter).
    with ix.writer() as writer:
        for filename in tqdm(doclist):
            add_doc(writer, filename)
if __name__ == '__main__':
    # Script entry point: index the files needed for TREC_Robust_2004.
    index_TREC_ROBUST_04()