It is time to do a little cleanup of the information collected by the scraper. This is another pass over the director-film DataFrame to see whether there are new movies to add, and to add the IMDb ID to the director film list.
import pandas as pd
from imdb import Cinemagoer
import re
import os.path
# Cinemagoer client used for every IMDb lookup in this notebook.
ia = Cinemagoer()
# Path of the CSV listing directors without an IMDb person id, and whether
# that file already exists on disk (decides create-vs-load further down).
file = 'missing_directors.csv'
check_missing_directors_file = os.path.isfile(file)
# Film rows per director, and the director names with their wiki links.
df_director_films = pd.read_csv('imdb_movies_directors.csv', encoding= 'unicode_escape')
df_director_names = pd.read_csv('director_names.csv', encoding='unicode_escape')
# Copy each director's IMDb person id from the film table onto the name table.
# A Name -> Person_ID lookup replaces the row-by-row nested scan. When a name
# appears on several film rows the last row wins, exactly as the nested loop
# behaved; rows without a name or id are ignored instead of crashing int().
person_id_by_name = (
    df_director_films.dropna(subset=["Name", "Person_ID"])
    .drop_duplicates(subset="Name", keep="last")
    .set_index("Name")["Person_ID"]
)
matched_ids = df_director_names["Name"].map(person_id_by_name)
if "Person_ID" in df_director_names.columns:
    # Keep an id that is already present when the film table has no match.
    matched_ids = matched_ids.fillna(df_director_names["Person_ID"])
# 0 marks "no IMDb person id found"; later cells filter on Person_ID == 0.
df_director_names["Person_ID"] = matched_ids.fillna(0)
df_director_names = df_director_names.astype({"Person_ID": int})
# "Unnamed: 0" is the index column that was written out with the source CSV.
df_director_names = df_director_names.drop("Unnamed: 0", axis=1)
df_director_names.to_csv("director_names_imdb_person_id.csv")
df_director_names
| | Name | wiki_link | Person_ID |
|---|---|---|---|
| 0 | Abdisalam Aato | https://en.wikipedia.org/wiki/Abdisalam_Aato | 0 |
| 1 | Gay Abel-Bey | https://en.wikipedia.org/wiki/Gay_Abel-Bey | 8498 |
| 2 | Fathia Absie | https://en.wikipedia.org/wiki/Fathia_Absie | 7280958 |
| 3 | Anita W. Addison | https://en.wikipedia.org/wiki/Anita_W._Addison | 11697 |
| 4 | Omowale Akintunde | https://en.wikipedia.org/wiki/Omowale_Akintunde | 0 |
| ... | ... | ... | ... |
| 313 | Tricia Woodgett | https://en.wikipedia.org/wiki/Tricia_Woodgett | 0 |
| 314 | Bille Woodruff | https://en.wikipedia.org/wiki/Bille_Woodruff | 937306 |
| 315 | Fronza Woods | https://en.wikipedia.org/wiki/Fronza_Woods | 0 |
| 316 | Tanya Wright | https://en.wikipedia.org/wiki/Tanya_Wright | 942859 |
| 317 | Phillip Youmans | https://en.wikipedia.org/wiki/Phillip_Youmans | 0 |
318 rows × 3 columns
# As of 6/23/2023, fetching biography information (e.g. birth dates) is not working:
# https://github.com/cinemagoer/cinemagoer/issues/462
# Probe: request only the 'biography' info set for one person and print which
# keys came back for it.
person = ia.get_person('0000332', info=['biography'])
print(person.infoset2keys)
{'biography': []}
Checking for directors that were not found automatically on IMDb.
Create a list of the missing directors, then manually check whether each one has an IMDb ID that can be used to add them to imdb_movies_directors.csv.
def imdb_search(name):
    """Search IMDb for a person whose name equals ``name`` exactly.

    Args:
        name: Name to search for; compared with ``==`` against each result's
            ``'name'`` field.

    Returns:
        A one-element list ``[[personID, name]]`` when exactly one search
        result has that exact name; otherwise the integer ``0`` (no exact
        match, or more than one).
    """
    exact_matches = [
        [candidate.personID, candidate['name']]
        for candidate in ia.search_person(name)
        if candidate['name'] == name
    ]
    # Zero hits and ambiguous hits are both reported as 0 to the caller.
    if len(exact_matches) != 1:
        return 0
    return exact_matches
# If there isn't already a missing_directors.csv, create one from the
# directors whose Person_ID is still 0.
# Otherwise load the existing missing_directors.csv so it can be checked for
# directors that can be added to director_names_imdb_person_id.csv, with
# their movies added to the end of imdb_movies_directors.
if not check_missing_directors_file:
    # .copy() gives an independent frame: Person_ID is written in place later,
    # and writing into a slice of df_director_names is not reliable.
    missing_directors_df = df_director_names.loc[df_director_names['Person_ID'] == 0].copy()
    missing_directors_df.to_csv(file)
else:
    missing_directors_df = pd.read_csv(file, encoding= 'unicode_escape')
# Manual spot check 1: show what IMDb returns for the name stored in the row
# labelled 7 of the missing-directors table.
missing = ia.search_person(missing_directors_df.loc[7, 'Name'])
print(missing)

# Manual spot check 2: fetch one person by a hard-coded IMDb id and list the
# films they directed together with the year (0 when IMDb gives no year).
madeline = ia.get_person('7507414')
print(madeline)
if madeline.has_key('director'):
    for movies in madeline['filmography']['director']:
        if 'year' in movies:
            year = int(movies['year'])
        else:
            year = 0
        print(movies['title'], year)
[<Person id:1919862[http] name:_Mya Baker_>,
<Person id:7983695[http] name:_Mya Baker_>,
<Person id:1263939[http] name:_Danielle Panabaker_>,
<Person id:0659048[http] name:_Kay Panabaker_>,
<Person id:6803655[http] name:_Julia Maren Baker_>,
<Person id:5208385[http] name:_Brianna Baker_>,
<Person id:1019674[http] name:_Sala Baker_>,
<Person id:1128564[http] name:_Kitana Baker_>,
<Person id:10453571[http] name:_Malia Baker_>,
<Person id:0000711[http] name:_Rick Baker_>,
<Person id:6966400[http] name:_Stella Baker_>,
<Person id:12140549[http] name:_Maya Baker_>,
<Person id:4124843[http] name:_Mia Baker_>,
<Person id:11325477[http] name:_Eva Brooke Baker_>,
<Person id:9698063[http] name:_Ekaterina Baker_>,
<Person id:1179722[http] name:_Anita Baker_>,
<Person id:0851963[http] name:_Andrea Baker_>,
<Person id:0672060[http] name:_D.A. Pennebaker_>,
<Person id:4080841[http] name:_Lucy May Barker_>,
<Person id:2531646[http] name:_Shauna Baker_>]
Madeline Anderson
Being Me 1975
I Am Somebody 1970
A Tribute to Malcolm X 1967
Integration Report I 1960
def update_directors_name(tmpID, tmpName):
    """Store an IMDb person id for a director in ``missing_directors_df``.

    Rows are selected by comparing ``tmpName`` with the ``Name`` column of
    ``missing_directors_df`` itself. Row labels from another table are not
    used, because once the CSV has been re-read the two tables no longer
    share labels and a label-based write would touch the wrong row or append
    a new one. A name that is not present leaves the frame unchanged.

    Args:
        tmpID: IMDb person id; any value accepted by ``int()``.
        tmpName: Director name, matched exactly against ``Name``.
    """
    name_matches = missing_directors_df['Name'] == tmpName
    missing_directors_df.loc[name_matches, 'Person_ID'] = int(tmpID)
# Try to resolve every still-missing director with an exact-name IMDb search.
# The names are taken as a plain list, so the loop does not depend on the
# frame's index labels (``['Name'][n]`` is a label lookup and raises KeyError
# when the index is not 0..n-1, e.g. on a filtered frame).
for director_name in missing_directors_df['Name'].tolist():
    imdb_person = imdb_search(director_name)
    if imdb_person == 0:
        # imdb_search returns 0 for "no exact match" and for ambiguous matches.
        continue
    person_id, person_name = imdb_person[0]
    person = ia.get_person(person_id)
    print(person)
    # Accept the match only when IMDb lists at least one directing credit.
    # One update is enough: repeating it per film wrote the same value again.
    if person.has_key('director') and person['filmography']['director']:
        update_directors_name(person_id, person_name)
missing_directors_df.to_csv("missing_directors.csv")
def wiki_imdb(imdb_id, wiki_name):
    """Append a director's films to ``df_director_films`` and record their id.

    Fetches the person from IMDb and, when they have a ``'director'`` entry,
    adds one row per directed film as
    ``[movieID, title, year, imdb_id, wiki_name]``, then calls
    ``update_directors_name(imdb_id, wiki_name)``.

    Args:
        imdb_id: IMDb person id to fetch.
        wiki_name: Director name stored with each appended film row.
    """
    director = ia.get_person(imdb_id)
    if not director.has_key('director'):
        return
    for film in director['filmography']['director']:
        if 'year' in film:
            release_year = int(film['year'])
        else:
            # 0 stands in for "no year listed".
            release_year = 0
        next_row = len(df_director_films.index)
        df_director_films.loc[next_row] = [film.movieID, film['title'], release_year, imdb_id, wiki_name]
    update_directors_name(imdb_id, wiki_name)
# Add the films of one director using a hard-coded IMDb person id and name.
wiki_imdb('10555174', 'Robert Van Lierop')
# Save the film table; index=False leaves the DataFrame index out of the CSV.
df_director_films.to_csv("Director_Movies_Clean.csv", index=False)
from bs4 import BeautifulSoup
import requests
# Prefix that identifies a link to an IMDb person page.
imdb_re = 'https://www.imdb.com/name/'
# Number of Wikipedia pages fetched successfully that had no IMDb person link.
total = 0
# Visit each director's Wikipedia page, look for an external IMDb person link
# and, when one is found, pull that person's films in through wiki_imdb.
# NOTE(review): test_df is not defined in this part of the notebook; it is
# expected to provide a 'wiki_link' column holding site-relative paths.
for wiki_link in test_df['wiki_link'].tolist():
    wiki_url = "https://en.wikipedia.org" + wiki_link
    print(wiki_url)
    table_check = False
    try:
        # A timeout keeps one unresponsive page from hanging the whole run.
        result = requests.get(wiki_url, timeout=30)
        result.raise_for_status()
    except requests.RequestException as e:
        # Skip this page: carrying on would reuse the previous page's parsed
        # document (or fail on the first page, where none exists yet).
        print(e)
        continue
    doc = BeautifulSoup(result.text, "html.parser")
    for imdb in doc.find_all(rel="nofollow"):
        # Elements without an href are treated as non-matching.
        href = imdb.get('href') or ''
        if not re.search(imdb_re, href):
            continue
        # The first run of digits in the link is used as the IMDb person id.
        imdb_number = re.search(r'\d+', href)
        if imdb_number is None:
            continue
        print(imdb.text)
        print(imdb_number.group())
        wiki_imdb(imdb_number.group(), imdb.text)
        table_check = True
    if not table_check:
        total += 1
        print(f"Total: {total} - There wasn't a match for {wiki_url}")
# Summary statistics of the film table. wiki_imdb stores 0 when a film has no
# 'year', so zeros in Year are placeholders rather than real release years.
df_director_films.describe()
| | Year |
|---|---|
| count | 6611.000000 |
| mean | 1797.361972 |
| std | 601.922601 |
| min | 0.000000 |
| 25% | 1977.000000 |
| 50% | 2001.000000 |
| 75% | 2013.000000 |
| max | 2027.000000 |
