-
Notifications
You must be signed in to change notification settings - Fork 0
/
generate_stats.py
executable file
·124 lines (93 loc) · 3.13 KB
/
generate_stats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
#!/usr/bin/env python3
"""
Generate word and dialogue statistics from a tagged, entity-annotated novel.
"""
import argparse
import re
from data import Data
from entities import EntityDatabase
from structure import Novel
# File that process() hands to EntityDatabase.load().
ENTITIES_FILENAME = 'entities.json'
# Delimiters used to cut chunk text into words: space, comma, period,
# exclamation mark and question mark.
WORD_SPLIT = re.compile(r'[ ,.!?]')
# Number of narrator entries (counted since the current speaker last spoke)
# after which get_surrounding_speakers() gives up looking for a neighbour.
DIALOGUE_SEPARATOR_LENGTH = 3
def get_surrounding_speakers(index, dialogue_stats):
    """Return the closest other speakers before and after one entry.

    Args:
        index: position in ``dialogue_stats`` of the entry to inspect.
        dialogue_stats: sequence of entries whose first element is the
            speaker of that entry.

    Returns:
        A ``(before, after)`` tuple. Each element is the first speaker met
        in that direction who is neither the narrator nor the speaker at
        ``index``, or ``None`` when the sequence ends or
        ``DIALOGUE_SEPARATOR_LENGTH`` narrator entries are counted first.
    """
    own_speaker = dialogue_stats[index][0]

    def nearest_other(positions):
        # Walk the given positions in order, looking for somebody else.
        narrator_run = 0
        for pos in positions:
            # Too much narration in between: treat the dialogue as over.
            if narrator_run >= DIALOGUE_SEPARATOR_LENGTH:
                break
            candidate = dialogue_stats[pos][0]
            if candidate == Data.NARRATOR:
                narrator_run += 1
            elif candidate == own_speaker:
                # Same speaker talking again restarts the narrator count.
                narrator_run = 0
            else:
                return candidate
        return None

    preceding = nearest_other(range(index - 1, -1, -1))
    following = nearest_other(range(index + 1, len(dialogue_stats)))
    return preceding, following
def process(novel, output_filename=None):
    """Collect word and dialogue statistics for a novel and save them.

    The novel is traversed with ``novel.for_each`` using three callbacks:
    one resets the per-chapter dialogue list, one records the words of
    every chunk, and one runs at the end of each chapter to credit every
    non-narrator entry to the speakers found around it.

    Args:
        novel: object providing
            ``for_each(chapter=..., chapter_done=..., chunk=...)``. Chunks
            passed to the chunk callback must offer ``is_direct()``,
            ``speaker`` and ``get_data()``.
        output_filename: path passed to ``Data.save``. When ``None`` (the
            default) the historical behaviour is kept and the path is
            ``args.input + '.stats'``, where ``args`` is the module-level
            result of argument parsing and therefore only exists when this
            file is run as a script.
    """
    # The entity database is loaded but not consulted below; the call is
    # kept so that loading behaves (and fails) exactly as it did before.
    edb = EntityDatabase()
    edb.load(ENTITIES_FILENAME)

    data = Data()
    # (speaker, words) entries of the chapter currently being processed.
    dialogue_stats = []

    def process_chapter(chapter):
        # Rebind the enclosing list; ``nonlocal`` keeps the state inside
        # this call instead of leaking it into a module-level global.
        nonlocal dialogue_stats
        dialogue_stats = []

    def process_chapter_done(chapter):
        for i, (speaker, words) in enumerate(dialogue_stats):
            # Compare against the same constant that process_chunk uses to
            # tag narration.
            if speaker == Data.NARRATOR:
                continue
            before, after = get_surrounding_speakers(i, dialogue_stats)
            score = len(words)
            if before and after and before != after:
                # with different persons before and after this speech
                # assume half of it was meant for the last person, half
                # of it for the next
                score //= 2
            if before:
                data.add_talked_to(speaker, before, score)
            if after:
                data.add_talked_to(speaker, after, score)

    def process_chunk(chunk):
        if chunk.is_direct():
            # Direct speech without a tagged speaker is filed as unknown.
            speaker = chunk.speaker or Data.UNKNOWN
        else:
            speaker = Data.NARRATOR
        words = []
        for word in WORD_SPLIT.split(chunk.get_data()):
            word = word.lower().strip()
            if not word:
                continue
            words.append(word)
            data.add_word(word, speaker)
        dialogue_stats.append((speaker, words))

    novel.for_each(chapter=process_chapter, chapter_done=process_chapter_done,
                   chunk=process_chunk)

    if output_filename is None:
        # Backward-compatible default: relies on the script-level ``args``.
        output_filename = args.input + '.stats'
    data.save(output_filename)
if __name__ == '__main__':
    # Command-line entry point: parse the input path, load the novel and
    # produce its statistics.
    parser = argparse.ArgumentParser(
        description='Grab all statistics from a tagged and entity-annotated novel.',
    )
    parser.add_argument(
        'input',
        type=str,
        help='input tagged, annotated novel file',
    )
    # ``args`` has to stay a module-level name: process() reads args.input
    # to build the name of the output file.
    args = parser.parse_args()

    novel = Novel(args.input)
    print("novel loaded...")
    process(novel)