Exploratory Data Analysis on Black Directors - The Analysis

import pandas as pd
import re
import imdb
import time

# Shared Cinemagoer client; every IMDb lookup in this notebook goes through it.
ia = imdb.Cinemagoer()

# Input tables. Note that the two CSV files are read with different encodings.
df_director_films = pd.read_csv(
    'Director_Movies_Clean.csv',
    encoding='utf-8-sig',
)
df_director_names = pd.read_csv(
    'director_names.csv',
    encoding='unicode_escape',
)

# Empty dict; not referenced again in the visible part of the notebook.
kinds = dict()

df_director_films

	Movie_ID	Title	Year	Person_ID	Name
0	3501074	Madam Secretary (2015-2017)	0	151	Morgan Freeman
1	106464	Bopha!	1993	151	Morgan Freeman
2	995854	A Journal for Jordan	2021	243	Denzel Washington
3	2671706	Fences	2016	243	Denzel Washington
4	413573	Grey's Anatomy	2016	243	Denzel Washington
...	...	...	...	...	...
6606	8571700	Remembrance: A Portrait Study	1967	9913311	Edward Owens
6607	12127402	Autrefois J'ai aime une femme	1966	9913311	Edward Owens
6608	12666216	Sisters of the Screen - African Women in Cinema	1966	11720380	Beti Ellerson
6609	12666216	Sisters of the Screen - African Women in Cinema	2002	11720380	Beti Ellerson
6610	10038432	A Luta Continua	1972	10555174	Robert Van Lierop

6611 rows × 5 columns

Remove all fully duplicated rows from the DataFrame

# Drop rows that repeat an earlier row in every column. The first occurrence
# is kept, a new frame is returned, and the original index labels survive
# (these are the pandas defaults).
df_movies = df_director_films.drop_duplicates()

df_movies

	Movie_ID	Title	Year	Person_ID	Name
0	3501074	Madam Secretary (2015-2017)	0	151	Morgan Freeman
1	106464	Bopha!	1993	151	Morgan Freeman
2	995854	A Journal for Jordan	2021	243	Denzel Washington
3	2671706	Fences	2016	243	Denzel Washington
4	413573	Grey's Anatomy	2016	243	Denzel Washington
...	...	...	...	...	...
6606	8571700	Remembrance: A Portrait Study	1967	9913311	Edward Owens
6607	12127402	Autrefois J'ai aime une femme	1966	9913311	Edward Owens
6608	12666216	Sisters of the Screen - African Women in Cinema	1966	11720380	Beti Ellerson
6609	12666216	Sisters of the Screen - African Women in Cinema	2002	11720380	Beti Ellerson
6610	10038432	A Luta Continua	1972	10555174	Robert Van Lierop

6316 rows × 5 columns

Remove all the movies with no year (possibly television shows)

# Keep only rows with a known release year (Year == 0 is the "no year" marker).
# .copy() makes the filtered frame an independent object, so the columns added
# to it further down do not trigger pandas' SettingWithCopyWarning.
df_movies = df_movies[df_movies.Year != 0].copy()

df_movies.describe()

	Movie_ID	Year	Person_ID
count	5.699000e+03	5699.000000	5.699000e+03
mean	4.781896e+06	1998.656431	9.356992e+05
std	4.761429e+06	19.859445	1.235733e+06
min	1.187000e+04	1919.000000	1.510000e+02
25%	3.851770e+05	1991.000000	2.288530e+05
50%	3.223890e+06	2004.000000	6.085750e+05
75%	8.135978e+06	2014.000000	9.373060e+05
max	2.107769e+07	2027.000000	1.172038e+07

df_movies.loc[df_movies['Year'] == 2021]

	Movie_ID	Title	Year	Person_ID	Name
2	995854	A Journal for Jordan	2021	243	Denzel Washington
19	14324128	NYC Epicenters 9/11-2021ÃÂÃÂ½	2021	490	Spike Lee
185	13935770	Chris Rock Total Blackout: The Tamborine Exten...	2021	1674	Chris Rock
267	10608454	Harlem	2021	2700	Malcolm D. Lee
268	3554046	Space Jam: A New Legacy	2021	2700	Malcolm D. Lee
...	...	...	...	...	...
6487	12223626	Black Seeds: The History of Africans in America	2021	5679045	Bayer Mack
6529	8537542	Cold Blood	2021	6462306	Taylor Ri'chard
6541	13814666	Pursued	2021	7280958	Fathia Absie
6552	15596914	Voodoo	2021	7385237	Phillip Youmans
6604	11753636	Private Imaginings and Narrative Facts	2021	9913311	Edward Owens

173 rows × 5 columns

# Scatter plot of release year against title.
df_movies.plot.scatter(x='Title', y='Year')

AxesSubplot

import matplotlib.pyplot as plt
import seaborn as sns

# Histogram of release years with a kernel-density curve drawn on top.
release_years = df_movies['Year']
sns.histplot(release_years, kde=True)
plt.show()

AxesSubplot

df_movies['Year'].describe()

count    5699.000000
mean     1998.656431
std        19.859445
min      1919.000000
25%      1991.000000
50%      2004.000000
75%      2014.000000
max      2027.000000
Name: Year, dtype: float64

def df_director_id(name: str):
    """Return the ID of the first ``df_movies`` row whose Name equals *name*.

    The Person_ID of that row is converted to text and left-padded with
    zeros to at least seven characters. When no row matches, ``None`` is
    returned.
    """
    padded_ids = (
        str(row.Person_ID).zfill(7)
        for row in df_movies.itertuples()
        if row.Name == name
    )
    # Rows are scanned in frame order and the scan stops at the first hit.
    return next(padded_ids, None)

def movie_format(movie_id):
    """Join the items of *movie_id* into a single comma-separated string.

    Args:
        movie_id: An iterable of values (each is passed through ``str``),
            or ``None``.

    Returns:
        ``None`` when *movie_id* is ``None``; otherwise the items joined
        with ``', '``. A single item comes back unchanged and an empty
        iterable yields ``''``.
    """
    # Identity test: ``== None`` can be hijacked by objects that overload
    # ``==`` (array-likes return an element-wise result, for example).
    if movie_id is None:
        return None
    # One join covers every length: the separator only appears between
    # items, so zero or one item behaves exactly as the old ' '.join branch.
    return ', '.join(map(str, movie_id))


def movie_metascore(movie_id):
    """Look up and return the ``'metascore'`` entry of *movie_id*.

    Plain subscription is used, so a record without that entry raises
    whatever its ``__getitem__`` raises (``KeyError`` for a dict).
    """
    metascore = movie_id['metascore']
    return metascore


def movie_budget(movie_id):
    """Extract the first numeric amount from a record's ``'box office'`` entry.

    The entry is converted to text and searched for the first run of digits
    and commas that ends in a digit (e.g. ``'1,000,000'``); any currency
    symbol or trailing text is left out.

    Args:
        movie_id: A movie record supporting ``.get`` (a dict works).

    Returns:
        The matched amount as a string, or ``None`` when the record has no
        ``'box office'`` entry or the entry contains no digits.
    """
    box_off = movie_id.get('box office', None)
    if box_off is None:
        return None

    # NOTE(review): the whole entry is searched, so the result is whichever
    # figure appears first in its text - confirm that is always the budget.
    budget = re.search(r'[0-9,]*\d', str(box_off))
    # A box-office entry without any digit used to crash on ``.group``.
    return budget.group(0) if budget else None
    
#tstmovie = ia.get_movie('13972246', info=['main', 'critic_reviews', 'vote details', 'plot'])
#print(tstmovie['box office'])
#budget = re.search('[0-9,]*\d', '$1') #str(tstmovie['box office']))
#print(budget)
#print(movie_budget(tstmovie))
# print(movie_metascore(tstmovie))

# Add the columns that the IMDb lookup loop fills in; each one starts out as
# an empty string in every row.
for new_column in ('Type', 'Genres', 'Metascore', 'Budget', 'Votes', 'Rating', 'Runtime'):
    df_movies[new_column] = ""
df_movies

# Renumber the rows from 0; the previous index labels are kept in a new
# 'index' column rather than discarded.
df_movies = df_movies.reset_index(drop=False)

df_movies.head(20)

#x = 0
# Fetch IMDb details for one slice of df_movies rows and store them in the
# frame. The slice bounds are fixed values for this run.
FIRST_ROW = 6049
LAST_ROW_EXCLUSIVE = 6316
ROWS_PER_PAUSE = 150
PAUSE_SECONDS = 180  # 3 minutes
MOVIE_INFOSETS = ['main', 'critic_reviews', 'vote details', 'plot']

# The upper bound is clamped to the frame length: after the year filter and
# reset_index the frame can be shorter than LAST_ROW_EXCLUSIVE, and asking
# for a row label that does not exist raises KeyError. When FIRST_ROW is
# already past the end of the frame the loop simply does nothing.
for i in range(FIRST_ROW, min(LAST_ROW_EXCLUSIVE, len(df_movies))):
    movie_id = df_movies.at[i, 'Movie_ID']
    if i % ROWS_PER_PAUSE == 0:
        # Progress marker followed by a long pause (presumably to go easy on
        # the IMDb servers - confirm).
        print(f"{movie_id} and  {df_movies.at[i, 'Title']}")
        time.sleep(PAUSE_SECONDS)

    movie = ia.get_movie(movie_id, info=MOVIE_INFOSETS)

    df_movies.loc[i, 'Type'] = movie.get('kind', None)
    # 'runtime' and 'genres' are flattened to one comma-separated string.
    df_movies.loc[i, 'Runtime'] = movie_format(movie.get('runtime', None))
    df_movies.loc[i, 'Rating'] = movie.get('rating', None)
    df_movies.loc[i, 'Votes'] = movie.get('votes', None)
    df_movies.loc[i, 'Genres'] = movie_format(movie.get('genres', None))
    df_movies.loc[i, 'Metascore'] = movie.get('metascore', None)
    df_movies.loc[i, 'Budget'] = movie_budget(movie)

9847438 and  Football Town: Cleveland Ohio
6340186 and  Film 13

# Save the enriched movie table; the frame's index is written as the first
# CSV column.
df_movies.to_csv("Director_Movies_Analyze.csv", encoding='utf-8-sig')

# Print every column except the first, one Series at a time.
for column_name in df_movies.columns[1:]:
    print(df_movies[column_name])

# Pull a few individual IMDb records to inspect what they contain.
spike_lee = ia.get_person('0000490')
sidney_poitier = ia.get_person('0001627')
tvshow = ia.get_movie('209093', info='critic_reviews')

for person in (spike_lee, sidney_poitier):
    print(f"{person['birth date']}  {person['height']} {person['birth info']} ")

1957-03-20  5' 7" (1.7 m) {'birth place': 'Atlanta, Georgia, USA'} 
1927-02-20  6' 2½" (1.89 m) {'birth place': 'Miami, Florida, USA'}

tvshow.infoset2keys

{'critic_reviews': []}

# Fetch a single title with the same infosets the bulk movie loop requests.
mv = ia.get_movie(
    '790770',
    info=['main', 'critic_reviews', 'vote details', 'plot'],
)

def person_height_ft(person_id):
    """Return the feet-and-inches part of a person's ``'height'`` entry.

    For a height such as ``5' 7" (1.7 m)`` the result is ``5' 7"``. The raw
    height value is also printed, as before.

    Args:
        person_id: A person record supporting ``.get`` (a dict works).

    Returns:
        The matched feet/inches text, or ``None`` when the record has no
        ``'height'`` entry or the entry does not contain that pattern.
    """
    person_height = person_id.get('height', None)
    print(person_height)
    if person_height is None:
        return None

    # Digits/apostrophes, one whitespace, digits, then at most one more
    # character (the inch mark or a fraction sign).
    # The unused metric-height search was dropped: it crashed on heights
    # that carry no "(... m)" part.
    feet_inches = re.search(r"[0-9']+\s[0-9]+.?", str(person_height))
    return feet_inches.group(0) if feet_inches else None

# Fetch one person record and show its 'birth info' entry (None when the
# record has no such entry).
tstperson = ia.get_person('0002700')
birth_list = tstperson.get('birth info')
print(birth_list)

{'birth place': 'USA'}

tstperson.infoset2keys

{'main': ['birth info', 'headshot', 'filmography', 'imdbID', 'name'],
 'biography': ['headshot',
  'birth name',
  'height',
  'mini biography',
  'trade mark',
  'trivia',
  'quotes',
  'birth date',
  'birth notes']}

# Reload the director table; the first CSV column is used as the index.
df_director_names = pd.read_csv("director_names_analyze.csv", index_col=[0], encoding="utf-8-sig")

# Fill in biographical details for row labels 67..310. Rows whose Person_ID
# is 0 are skipped.
for i in range(67, 311):
    person_id = df_director_names.at[i, 'Person_ID']
    if person_id == 0:
        continue
    if i % 100 == 0:
        # Progress marker followed by a long pause between batches.
        print(f"{person_id} and  {df_director_names.at[i, 'Name']}")
        time.sleep(180)  # 3 minutes
    director = ia.get_person(str(person_id).zfill(7))

    # Height: split a value like 5' 7" (1.7 m) into its imperial and metric
    # parts. Either part stays None when its pattern is not found, instead
    # of crashing on ``.group``.
    person_height = director.get('height', "")
    print(person_height)
    height_ft = height_m = None
    if person_height != "":
        height_text = str(person_height)
        ft_match = re.search(r"[0-9']*\s[0-9]*.?", height_text)
        m_match = re.search(r'\(([0-9]+\.?[0-9]*)', height_text)
        if ft_match:
            height_ft = ft_match.group(0)
        if m_match:
            height_m = m_match.group(1)

    # Birth place: read it from the director just fetched (the earlier code
    # read it from the fixed test person, so every row got the same place).
    # The place text has a variable number of ", "-separated parts: the last
    # is taken as the country, the one before it as the state, and - when
    # there are three or more - the first as the city.
    birth_city = birth_state = birth_country = None
    birth_list = director.get('birth info', None)
    birth_place = birth_list.get('birth place') if birth_list else None
    if birth_place is not None:
        place_parts = str(birth_place).split(", ")
        birth_country = place_parts[-1]
        if len(place_parts) >= 2:
            birth_state = place_parts[-2]
        if len(place_parts) >= 3:
            birth_city = place_parts[0]

    df_director_names.loc[i, 'Birth Date'] = director.get('birth date', None)
    df_director_names.loc[i, 'Height ft'] = height_ft
    df_director_names.loc[i, 'Height m'] = height_m
    df_director_names.loc[i, 'Birth City'] = birth_city
    df_director_names.loc[i, 'Birth State'] = birth_state
    df_director_names.loc[i, 'Birth Country'] = birth_country
    df_director_names.loc[i, 'Headshot'] = director.get('headshot', None)
    # NOTE(review): the person keys printed earlier in this notebook include
    # 'mini biography' but no 'biography', so this lookup may always yield
    # None - confirm the intended key.
    try:
        df_director_names.loc[i, 'Biography'] = director.get('biography', None)
    except Exception:
        # Best effort: mark the cell and carry on with the next director.
        df_director_names.loc[i, 'Biography'] = "Error"
        print("test")

# Make sure the biographical columns exist before saving. Only columns that
# are missing are created (filled with empty strings): unconditionally
# assigning "" here would erase values already scraped into these columns
# when this cell runs after the lookup loop.
for column in (
    'Birth Date',
    'Height ft',
    'Height m',
    'Birth City',
    'Birth State',
    'Birth Country',
    'Headshot',
    'Biography',
):
    if column not in df_director_names.columns:
        df_director_names[column] = ""
df_director_names

df_director_names.to_csv("director_names_analyze.csv", encoding='utf-8-sig')

# Re-derive the birth-place columns for row labels 0..310. Only 'Birth City',
# 'Birth State' and 'Birth Country' are written here; the other columns are
# left as they are. Rows whose Person_ID is 0 are skipped.
for i in range(0, 311):
    person_id = df_director_names.at[i, 'Person_ID']
    if person_id == 0:
        continue
    director = ia.get_person(str(person_id).zfill(7))

    birth_city = birth_state = birth_country = None
    birth_list = director.get('birth info', None)
    # A record may lack 'birth info', or (defensively) its 'birth place'.
    birth_place = birth_list.get('birth place') if birth_list is not None else None
    if birth_place is not None:
        # Split once. The text has a variable number of ", "-separated parts:
        #   1 part   -> country
        #   2 parts  -> state, country
        #   3+ parts -> first part is the city, the last two are state and
        #               country; anything in between is dropped (this matches
        #               the former 4-part handling and no longer fails on
        #               longer place strings).
        place_parts = str(birth_place).split(", ")
        birth_country = place_parts[-1]
        if len(place_parts) >= 2:
            birth_state = place_parts[-2]
        if len(place_parts) >= 3:
            birth_city = place_parts[0]

    df_director_names.loc[i, 'Birth City'] = birth_city
    df_director_names.loc[i, 'Birth State'] = birth_state
    df_director_names.loc[i, 'Birth Country'] = birth_country