-
Notifications
You must be signed in to change notification settings - Fork 0
/
google_history_analyser.py
107 lines (102 loc) · 3.12 KB
/
google_history_analyser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
#-------------------------------------------------------------------------------
# Name : Google History Analyser
# Purpose : Program for analysing the google search history
#
# Author : Rajan Chauhan
# Created : 7 /Dec/2015
# Copyright : https://plus.google.com/+rajanchauhandiscover
# Licence : Creative Commons 1.0 Universal
#-------------------------------------------------------------------------------
import json
import datetime
import re
import os
import codecs
from bs4 import BeautifulSoup
def gen_filename(genfile):
    """Prompt the user for an output file name and split off its extension.

    Args:
        genfile: Falsy for the first prompt; truthy when a previously entered
            name was rejected, which selects the "file exists" prompt.

    Returns:
        A ``(stem, ext)`` tuple as produced by ``os.path.splitext``: ``ext``
        keeps its leading dot and is ``''`` when the name has no extension.
    """
    if genfile:
        filename = input('File exists provide another name')
    else:
        filename = input('provide filename : ')
    # splitext always yields a 2-tuple, so the caller can unpack the result
    # whether or not the entered name contains a dot.
    return os.path.splitext(filename)
def get_text(elementlist):
    """Return the leading "plain" portion of each element's text.

    For every element, the match starts at the beginning of ``element.text``
    and extends over letters, digits and the characters in the range from
    space to ``.`` (the `` -.`` inside the character class is a range, not
    three literals). The result is an empty string when the text does not
    start with such a character.

    Args:
        elementlist: Iterable of objects exposing a ``text`` attribute.

    Returns:
        A list with one matched string per element, in input order.
    """
    pattern = re.compile(r'([a-zA-Z0-9 -.]*)')
    names = []
    for element in elementlist:
        names.append(pattern.search(element.text).group())
    return names
def save(flist):
    """Add every word of every query in *flist* to the global frequency table.

    Args:
        flist: Mapping with an ``"event"`` list; each entry is read as
            ``entry["query"]["query_text"]`` and split on whitespace.
    """
    global lst
    for event in flist["event"]:
        words = event["query"]["query_text"].split()
        for word in words:
            # Current count (0 for an unseen word) plus this occurrence.
            lst[word] = 1 + get_frequency(word)
def get_frequency(word):
    """Return the count recorded for *word* in the global ``lst`` table.

    Args:
        word: Key to look up.

    Returns:
        The stored count, or 0 when *word* has not been recorded yet.
    """
    try:
        return lst[word]
    except KeyError:
        # Only a missing key means "never seen"; any other failure is a real
        # error and must not be masked as a zero count.
        return 0
def get_files():
    """Read ``index.html`` in the current directory and list the file names in it.

    The names are taken from every ``<div class="extracted-file-name">``
    element and passed through ``get_text``.

    Returns:
        The list produced by ``get_text`` for the matching elements.

    Raises:
        IOError: If ``index.html`` is not present in the current directory.
    """
    index_name = 'index.html'
    if index_name not in os.listdir():
        print('No file present')
        raise IOError(index_name + ' not found in ' + os.getcwd())
    # The context manager closes the file even if reading or parsing fails.
    with open(index_name) as index:
        soup = BeautifulSoup(index.read(), 'html.parser')
    name_divs = soup.find_all('div', {'class': 'extracted-file-name'})
    return get_text(name_divs)
def get_top(n):
    """Return the first *n* ``(word, count)`` pairs ordered by count, highest first.

    Words with equal counts keep the order in which they appear in the
    global ``lst`` table.

    Args:
        n: Upper slice bound applied to the sorted pairs.

    Returns:
        A list of ``(word, count)`` tuples, or ``None`` when ``lst`` is ``None``.
    """
    global lst
    if lst is None:
        return None
    ranked = list(lst.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:n]
"""For printing the list of touples in pretty manner"""
def prettify(_list):
    """Print pairs as aligned ``first : second`` lines.

    The first items are padded with spaces to the length of the longest one
    so that the colons line up.

    Args:
        _list: Sequence of 2-item sequences whose first item supports
            ``len``. An empty or ``None`` value prints nothing.
    """
    if not _list:
        # max() over an empty sequence raises ValueError; there is simply
        # nothing to print in that case.
        return
    width = max(len(pair[0]) for pair in _list)
    for pair in _list:
        padding = ' ' * (width - len(pair[0]))
        print(str(pair[0]) + padding + " : " + str(pair[1]))
""" Work on function for generating list """
def main():
    """Interactively build the search-word frequency table and optionally save it.

    Steps:
      1. Ask for the download folder (the one containing ``index.html``) and
         make it the current working directory.
      2. Load every file named by ``get_files`` from the ``Searches``
         sub-folder as JSON and feed it to ``save``.
      3. If the user answers ``y``, ask for a file name that does not exist
         yet in that folder and write ``str(lst)`` to it.

    ``gen_filename`` is expected to return ``(stem, ext)`` where ``ext`` is
    empty when the user gave no extension; ``.txt`` is used in that case.

    Raises:
        OSError: If the folder cannot be entered or ``index.html`` is missing.
    """
    global lst
    lst = {}         # word -> number of times it was searched
    genfile = False  # True once an entered file name turned out to exist

    # Path to the download folder which contains the index.html file.
    path = str(input('provide path :'))
    while not path:
        path = str(input('proivde correct path'))
    try:
        os.chdir(path)
    except OSError:
        # Retry once without trailing path separators; if that changes
        # nothing, let the original error surface.
        trimmed = path.rstrip('\\/')
        if not trimmed or trimmed == path:
            raise
        os.chdir(trimmed)

    for name in get_files():
        try:
            # os.path.join keeps the lookup portable instead of hard-coding
            # a backslash; the context manager closes the file on any exit.
            with codecs.open(os.path.join('Searches', name),
                             encoding='latin-1') as f:
                flist = json.load(f)
        except IOError:
            # Best effort: a listed file that cannot be opened is skipped.
            continue
        except ValueError:
            # Covers undecodable and malformed JSON alike; report the file
            # and carry on with the rest.
            print(name)
            continue
        save(flist)

    cache_result = input('Do you want to save file in one y or n :')
    if cache_result == 'y':
        while True:
            filename, ext = gen_filename(genfile)
            # Check the exact name that will be written, defaulting the
            # extension to .txt when none was given.
            target = filename + (ext if ext != '' else '.txt')
            if target in os.listdir():
                genfile = True
                continue
            break
        # The working directory already is the chosen folder, so the bare
        # name lands next to index.html and matches the existence check.
        with open(target, 'w') as out:
            out.write(str(lst))
# Start the interactive analysis only when this file is run as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()