forked from PedroUria/NLP-Movie_Scripts
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_success_data.py
105 lines (90 loc) · 4.4 KB
/
get_success_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import requests
from bs4 import BeautifulSoup
import os
import pandas as pd
import numpy as np
import operator
def get_characters(script, threshold_char=5, threshold_space=-1):
""" Gets the characters of a script """
with open(os.getcwd() + "/scripts/" + script, "r") as s:
script = s.read()
poss_char = {}
for line in script.split("\n"):
if line.count(" ") > threshold_space:
key = line.strip(" ").replace(line[line.find("("):line.find(")") + 1], "").strip(" ").replace("\t", "").strip(" ")
if key not in poss_char:
poss_char[key] = 1
else:
poss_char[key] += 1
sorted_poss_chars = sorted(poss_char.items(), key=operator.itemgetter(1), reverse=True)
chars = []
for char_tuple in sorted_poss_chars:
if char_tuple[1] > threshold_char:
chars.append(char_tuple)
else:
break
return [char[0] for char in chars if char[0] != "" and "-" not in char[0] and "." not in char[0] and ":" not in char[0] and "?" not in char[0]]
# Gets the titles of the movies for which we have scripts
ready_scripts = [file_name[:file_name.find("_")] for file_name
in os.listdir(os.getcwd() + "/scripts/")
if ".txt" in file_name]
n_scripts_before = len(ready_scripts)
for script in ready_scripts:
path = os.getcwd() + "/scripts/" + script + "_script.txt"
with open(path, "r") as s:
m = s.read()
# Deletes scripts that were pdf and thus we got html trash
if len(m) < 3000:
os.remove(path)
continue
# Deletes scripts that do not follow the standard format
# Such us: MULAN: Hey yo what's up?
# Instead of: MULAN
# Hey yo what's up?
if len(get_characters(script + "_script.txt")) == 0:
os.remove(path)
ready_scripts = [file_name[:file_name.find("_")] for file_name
in os.listdir(os.getcwd() + "/scripts/")
if ".txt" in file_name]
print("We removed", n_scripts_before - len(ready_scripts), "scripts") # We removed 23 scripts
print("There are", len(ready_scripts), "scripts now")
# Will store the data for each title
org_title, prep_title = [], []
release_date, prod_budget, dom_gross, world_gross = [], [], [], []
website_url = requests.get("https://www.the-numbers.com/movie/budgets/all").text
def get_data():
""" Scrapes the data from https://www.the-numbers.com/movie/budgets/all """
soup = BeautifulSoup(website_url, "lxml")
movies = soup.find_all("tr")[1:]
for movie in movies:
original_title = movie.find("b").string
# Gets the title that will match the titles we have saved as movie scripts
processed_title = original_title.replace(": ", "-").replace(" ", "-").replace("â\x80\x99", "\'")
if processed_title[:4] == "The-":
processed_title = processed_title[processed_title.find("The-")+4:] + "," + "-The"
# If the script is in our directory, gets the data
if processed_title in ready_scripts and processed_title not in prep_title: # Avoids duplicates
org_title.append(original_title.replace("â\x80\x99", "\'"))
prep_title.append(processed_title)
release_date.append(movie.find("a").string)
numbers = movie.find_all("td", {"class": "data"})
prod_budget.append(int(numbers[1].string[1:].replace(",", "")))
dom_gross.append(int(numbers[2].string[1:].replace(",", "")))
world_gross.append(int(numbers[3].string[1:].replace(",", "")))
get_data()
# Does the same for all the web pages
for i in range(101, 5800, 100):
website_url = requests.get("https://www.the-numbers.com/movie/budgets/all/" + str(i)).text
get_data()
# Creates DataFrame
df = pd.DataFrame({"Processed Title": prep_title, "Release Date": release_date,
"Production Budget ($)": prod_budget, "Domestic Gross ($)": dom_gross,
"Worldwide Gross ($)": world_gross}, index=org_title)
# Replaces 0s with np.Nans and drops them
df.replace(0, np.NaN, inplace=True)
df.dropna(inplace=True)
# Gets ROI columns
df["Domestic ROI (%)"] = (df["Domestic Gross ($)"] - df["Production Budget ($)"])*100/df["Production Budget ($)"]
df["Worldwide ROI (%)"] = (df["Worldwide Gross ($)"] - df["Production Budget ($)"])*100/df["Production Budget ($)"]
# Saves the data as csv
df.to_csv("success_data.csv")