Skip to content

Commit c82708c

Browse files
authored
Rotten tomato tomatometer score scrape code
Scraping code for rottentomatoes.com, Tomatometer score, theatrical release date, streaming release date, etc.
1 parent 3044d27 commit c82708c

File tree

1 file changed

+118
-0
lines changed

1 file changed

+118
-0
lines changed
Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
# -*- coding: utf-8 -*-
2+
"""
3+
This code is for educational purposes only.
4+
Created on Tue Jan 6 2021
5+
6+
@author: Yug Agrawal
7+
"""
8+
import bs4
9+
import pandas as pd
10+
import json
11+
from requests import get
12+
import re
13+
import numpy as np
14+
15+
# Load the list of movies to look up. The rest of this script reads the
# 'Title' and 'Title_with_year' columns of this sheet.
titles = pd.read_excel("Movie_Name_List.xlsx")
16+
17+
18+
def getIndex(movie_containers, tag):
    """Return the index of the first container whose ``div`` text equals *tag*.

    Parameters
    ----------
    movie_containers : iterable
        Objects exposing a ``.div`` attribute whose ``.text`` is the row label
        (the callers in this file pass the result of ``soup.find_all``).
    tag : str
        Exact label text to look for, e.g. ``'Release Date (Theaters):'``.

    Returns
    -------
    int or None
        Zero-based position of the first match, or ``None`` when no container
        carries that label.
    """
    for index, container in enumerate(movie_containers):
        try:
            label = container.div.text
        except AttributeError:
            # The container has no ``div`` child (``container.div`` is None)
            # or is not a tag-like object at all; skip it.
            continue
        if label == tag:
            return index
    return None
26+
27+
def Streaming_date(Moviemeta):
    """Extract the streaming release date from a movie's metadata rows.

    Parameters
    ----------
    Moviemeta : sequence
        Metadata row elements (the caller passes
        ``soup.find_all('li', class_='meta-row clearfix')``).

    Returns
    -------
    str or None
        The text following the ``'Release Date (Streaming):'`` label with
        surrounding whitespace removed, or ``None`` when the row is absent or
        cannot be parsed.
    """
    try:
        row_index = getIndex(Moviemeta, 'Release Date (Streaming):')
        if row_index is None:
            return None
        row_text = Moviemeta[row_index].text.strip().replace('\n', '')
    except (TypeError, IndexError, AttributeError):
        # Best effort: a malformed or unexpected row means "no date".
        return None

    # Split only on the first colon so a value that itself contains a colon
    # is kept whole instead of being truncated.
    _label, separator, value = row_text.partition(':')
    if not separator:
        return None
    return value.strip()
35+
36+
37+
def Theatre_rlease_date(Moviemeta):
    """Extract the theatrical release date from a movie's metadata rows.

    Parameters
    ----------
    Moviemeta : sequence
        Metadata row elements (the caller passes
        ``soup.find_all('li', class_='meta-row clearfix')``).

    Returns
    -------
    str or None
        The text following the ``'Release Date (Theaters):'`` label with
        surrounding whitespace removed, or ``None`` when the row is absent or
        cannot be parsed.
    """
    try:
        row_index = getIndex(Moviemeta, 'Release Date (Theaters):')
        if row_index is None:
            return None
        row_text = Moviemeta[row_index].text.strip().replace('\n', '')
    except (TypeError, IndexError, AttributeError):
        # Best effort: a malformed or unexpected row means "no date".
        return None

    # Split only on the first colon so a value that itself contains a colon
    # is kept whole instead of being truncated.
    _label, separator, value = row_text.partition(':')
    if not separator:
        return None
    return value.strip()
45+
46+
def title_clean(col_name, frame=None):
    """Turn the titles in one column into lower-case, underscore-separated slugs.

    The column is rewritten in place: spaces become ``_``, ``&`` becomes
    ``and``, the characters ``: ' - , . / ! ( )`` are dropped, runs of
    underscores are collapsed to a single ``_`` and the result is lower-cased.

    Parameters
    ----------
    col_name : str
        Name of the column to clean.
    frame : pandas.DataFrame, optional
        Frame holding the column. Defaults to the module-level ``titles``
        frame, which is what existing callers rely on.
    """
    if frame is None:
        frame = titles

    # One literal, single-pass character mapping instead of a chain of
    # ``str.replace`` calls whose regex/literal meaning depends on the
    # pandas version (``.``, ``(`` and ``)`` are regex metacharacters).
    table = str.maketrans({
        " ": "_",
        "&": "and",
        ":": None,
        "'": None,
        "-": None,
        ",": None,
        ".": None,
        "/": None,
        "!": None,
        "(": None,
        ")": None,
    })
    cleaned = frame[col_name].str.translate(table)

    # Collapse underscores only after every removal, so a dropped character
    # that sat between two spaces ("A / B") cannot leave "__" behind.
    cleaned = cleaned.str.replace(r"_+", "_", regex=True)

    frame[col_name] = cleaned.str.lower()
60+
61+
# Normalise both title columns into the slug form used in movie page URLs.
title_clean('Title')
title_clean('Title_with_year')

# Result columns. The score and date columns end up holding strings
# (e.g. a percentage or "Not Available"), so they are created with object
# dtype up front rather than as int/float columns that would have to be
# upcast on the first string assignment.
titles['RT_Score'] = pd.Series(0, index=titles.index, dtype=object)
titles['RT_Score_without_year_flag'] = pd.Series(0, index=titles.index, dtype=object)
titles['with_year'] = 0

titles['Theatre_Release_date'] = pd.Series(np.nan, index=titles.index, dtype=object)
titles['Stream_Release_date'] = pd.Series(np.nan, index=titles.index, dtype=object)


def _scrape_movie(slug):
    """Fetch one movie page and return ``(score, theatre_date, streaming_date)``.

    Returns ``None`` when the page cannot be fetched or carries no score
    element, so the caller can fall back to another slug.
    """
    try:
        page_movie = 'https://www.rottentomatoes.com/m/' + slug
        # Without a timeout a single stalled connection would hang the run.
        response = get(page_movie, timeout=30)
        soup = bs4.BeautifulSoup(response.text, 'lxml')

        # Score
        score = soup.find_all('span', class_='mop-ratings-wrap__percentage')
        rating = score[0].text.strip().replace('\n', '').split(' ')[0]

        Moviemeta = soup.find_all('li', class_='meta-row clearfix')
        return rating, Theatre_rlease_date(Moviemeta), Streaming_date(Moviemeta)
    except Exception as err:
        # Per-movie best effort: report and move on. ``Exception`` (not a
        # bare ``except``) keeps Ctrl-C able to stop a long scrape.
        print('  no score found for slug', repr(slug), '-', repr(err))
        return None


# Iterate over the frame's own index labels so each result lands on the row
# it belongs to, without maintaining a separate counter.
for row_label, title, Title_with_year in zip(titles.index, titles.Title, titles.Title_with_year):
    print(title)

    # First choice: the slug that includes the year.
    details = _scrape_movie(Title_with_year)
    if details is not None:
        score_column = 'RT_Score'
        titles.loc[row_label, 'with_year'] = 1
    else:
        # Fallback: the slug without the year.
        details = _scrape_movie(title)
        score_column = 'RT_Score_without_year_flag'

    if details is None:
        titles.loc[row_label, 'RT_Score'] = "Not Available"
        titles.loc[row_label, 'RT_Score_without_year_flag'] = "Not Available"
        continue

    rating, Theatre_Release_date, Streaming_Release_date = details
    titles.loc[row_label, score_column] = rating
    titles.loc[row_label, 'Theatre_Release_date'] = Theatre_Release_date
    titles.loc[row_label, 'Stream_Release_date'] = Streaming_Release_date

titles.to_excel("Rotten_Tomato_Scarpper_code_output.xlsx")

0 commit comments

Comments
 (0)