1+ # -*- coding: utf-8 -*-
2+ """
This code is for educational purposes only.
4+ Created on Tue Jan 6 2021
5+
6+ @author: Yug Agrawal
7+ """
8+ import bs4
9+ import pandas as pd
10+ import json
11+ from requests import get
12+ import re
13+ import numpy as np
14+
# Load the list of movies to look up. The rest of the script reads the
# 'Title' and 'Title_with_year' columns of this workbook.
titles = pd.read_excel("Movie_Name_List.xlsx")
16+
17+
def getIndex(movie_containers, tag):
    """Return the position of the first container whose ``.div.text`` equals *tag*.

    Parameters
    ----------
    movie_containers : iterable
        Objects exposing a ``div`` attribute that in turn has a ``text``
        attribute (the script passes the ``<li>`` elements returned by
        ``soup.find_all``).
    tag : str
        Exact label text to look for, e.g. ``'Release Date (Streaming):'``.

    Returns
    -------
    int or None
        Zero-based index of the first match, or ``None`` when nothing matches.
    """
    for index, container in enumerate(movie_containers):
        try:
            label = container.div.text
        except AttributeError:
            # Container has no usable ``div`` (e.g. ``div`` is None): skip it.
            continue
        if label == tag:
            return index
    return None
26+
def Streaming_date(Moviemeta):
    """Return the text listed under 'Release Date (Streaming):', or ``None``.

    Parameters
    ----------
    Moviemeta : sequence
        Metadata rows of a movie page (the script passes the result of
        ``soup.find_all('li', class_='meta-row clearfix')``).

    Returns
    -------
    str or None
        The stripped text between the first and second ``':'`` of the
        matching row, or ``None`` when the row is missing or cannot be parsed.
    """
    try:
        Release_Streaming_index = getIndex(Moviemeta, 'Release Date (Streaming):')
        if Release_Streaming_index is None:
            # The page has no streaming-release row.
            return None
        row_text = Moviemeta[Release_Streaming_index].text
        # Row text is "<label>: <value>"; keep only the part after the label.
        return row_text.strip().replace('\n ', '').split(':')[1].strip()
    except (AttributeError, IndexError, TypeError):
        # Malformed row or unexpected input: treat as "no date available".
        return None
35+
36+
def Theatre_rlease_date(Moviemeta):
    """Return the text listed under 'Release Date (Theaters):', or ``None``.

    Parameters
    ----------
    Moviemeta : sequence
        Metadata rows of a movie page (the script passes the result of
        ``soup.find_all('li', class_='meta-row clearfix')``).

    Returns
    -------
    str or None
        The stripped text between the first and second ``':'`` of the
        matching row, or ``None`` when the row is missing or cannot be parsed.
    """
    try:
        Release_Th_index = getIndex(Moviemeta, 'Release Date (Theaters):')
        if Release_Th_index is None:
            # The page has no theatrical-release row.
            return None
        row_text = Moviemeta[Release_Th_index].text
        # Row text is "<label>: <value>"; keep only the part after the label.
        return row_text.strip().replace('\n ', '').split(':')[1].strip()
    except (AttributeError, IndexError, TypeError):
        # Malformed row or unexpected input: treat as "no date available".
        return None
45+
def title_clean(col_name, frame=None):
    """Normalise a title column in place into lowercase, underscore-separated text.

    The cleaned values are what the script appends to the movie URL.

    Steps, applied to every string in the column:
      * spaces become ``_`` and ``&`` becomes ``and``;
      * the characters ``: ' - , . / ! ( )`` are removed;
      * runs of underscores are collapsed to a single ``_`` (done last, so
        gaps left by removed punctuation such as ``"a / b"`` collapse too);
      * the result is lower-cased.

    Parameters
    ----------
    col_name : str
        Name of the column to clean.
    frame : pandas.DataFrame, optional
        Frame holding the column. Defaults to the module-level ``titles``
        frame, which keeps existing ``title_clean('Title')`` calls working.

    Returns
    -------
    None
        The column is overwritten in ``frame``.
    """
    if frame is None:
        frame = titles

    column = frame[col_name]
    # regex=False: these are literal characters, independent of pandas' default.
    column = column.str.replace(" ", "_", regex=False)
    column = column.str.replace("&", "and", regex=False)
    # One pass removes every unwanted punctuation character.
    column = column.str.replace(r"[:'\-,./!()]", "", regex=True)
    # Collapse after all removals so no double underscores survive.
    column = column.str.replace(r"_+", "_", regex=True)
    frame[col_name] = column.str.lower()
60+
# Normalise both title columns in place; the cleaned values are later
# appended to the Rotten Tomatoes movie URL.
title_clean('Title')
title_clean('Title_with_year')

# The score columns start at 0 but later receive text (a scraped score or
# "Not Available"). Create them with object dtype so those assignments do not
# rely on pandas silently upcasting an integer column.
titles['RT_Score'] = pd.Series(0, index=titles.index, dtype=object)
titles['RT_Score_without_year_flag'] = pd.Series(0, index=titles.index, dtype=object)
# Flag set to 1 when the score was found using the title-with-year URL.
titles['with_year'] = 0

# Release dates are stored as text (or left missing), so these are object
# columns too rather than float columns.
titles['Theatre_Release_date'] = pd.Series(np.nan, index=titles.index, dtype=object)
titles['Stream_Release_date'] = pd.Series(np.nan, index=titles.index, dtype=object)
70+
def _fetch_movie_details(slug):
    """Download one movie page and return ``(score, theatre_date, stream_date)``.

    Parameters
    ----------
    slug : str
        Cleaned title appended to ``https://www.rottentomatoes.com/m/``.

    Returns
    -------
    tuple
        The first space-separated token of the first
        ``span.mop-ratings-wrap__percentage`` element, followed by the
        theatrical and streaming release dates (each may be ``None``).

    Raises
    ------
    Exception
        Whatever the request or parsing raises: a network error or timeout,
        ``IndexError`` when the page has no score element, ``TypeError``
        when *slug* is not a string (e.g. a missing title).
    """
    page_movie = 'https://www.rottentomatoes.com/m/' + slug
    # A timeout keeps one unresponsive request from stalling the whole run.
    response = get(page_movie, timeout=30)
    soup = bs4.BeautifulSoup(response.text, 'lxml')

    # Score
    score = soup.find_all('span', class_='mop-ratings-wrap__percentage')
    rt_score = score[0].text.strip().replace('\n ', '').split(' ')[0]

    Moviemeta = soup.find_all('li', class_='meta-row clearfix')
    return rt_score, Theatre_rlease_date(Moviemeta), Streaming_date(Moviemeta)


# Row labels are assumed to be 0..n-1 (the default index), as in the original
# counter-based loop.
for index_no, (title, Title_with_year) in enumerate(zip(titles.Title, titles.Title_with_year)):
    print(title)

    # Try the title-with-year URL first, then fall back to the plain title.
    # Each attempt names the column that receives the score and whether the
    # 'with_year' flag should be set.
    attempts = (
        (Title_with_year, 'RT_Score', True),
        (title, 'RT_Score_without_year_flag', False),
    )
    for slug, score_column, used_year in attempts:
        try:
            rt_score, Theatre_Release_date, Streaming_Release_date = _fetch_movie_details(slug)
        except Exception:
            # Best effort: any failure for this URL moves on to the next one.
            # ``Exception`` (not a bare except) lets Ctrl-C stop the script.
            continue

        titles.loc[index_no, score_column] = rt_score
        if used_year:
            titles.loc[index_no, 'with_year'] = 1
        titles.loc[index_no, 'Theatre_Release_date'] = Theatre_Release_date
        titles.loc[index_no, 'Stream_Release_date'] = Streaming_Release_date
        break
    else:
        # Neither URL produced a score.
        titles.loc[index_no, 'RT_Score'] = "Not Available"
        titles.loc[index_no, 'RT_Score_without_year_flag'] = "Not Available"


titles.to_excel("Rotten_Tomato_Scarpper_code_output.xlsx")
0 commit comments