import pandas as pd
import re
import imdb
import time

# Cinemagoer client used for every IMDb lookup further down.
ia = imdb.Cinemagoer()
# Director/film pairs (the displayed columns are Movie_ID, Title, Year, Person_ID, Name).
df_director_films = pd.read_csv('Director_Movies_Clean.csv', encoding= 'utf-8-sig')
# Director table; note it is read with a different encoding than the film table.
df_director_names = pd.read_csv('director_names.csv', encoding='unicode_escape')
# NOTE(review): `kinds` is not referenced anywhere else in the visible part of the file.
kinds = {}
# Notebook display of the raw film table.
df_director_films

Movie_IDTitleYearPerson_IDName
03501074Madam Secretary (2015-2017)0151Morgan Freeman
1106464Bopha!1993151Morgan Freeman
2995854A Journal for Jordan2021243Denzel Washington
32671706Fences2016243Denzel Washington
4413573Grey's Anatomy2016243Denzel Washington
..................
66068571700Remembrance: A Portrait Study19679913311Edward Owens
660712127402Autrefois J'ai aime une femme19669913311Edward Owens
660812666216Sisters of the Screen - African Women in Cinema196611720380Beti Ellerson
660912666216Sisters of the Screen - African Women in Cinema200211720380Beti Ellerson
661010038432A Luta Continua197210555174Robert Van Lierop

6611 rows × 5 columns

Remove all duplicate rows from the DataFrame

# Drop rows that are identical in every column, keeping the first occurrence
# and the original index labels (these are the pandas defaults).
df_movies = df_director_films.drop_duplicates()
df_movies

Movie_IDTitleYearPerson_IDName
03501074Madam Secretary (2015-2017)0151Morgan Freeman
1106464Bopha!1993151Morgan Freeman
2995854A Journal for Jordan2021243Denzel Washington
32671706Fences2016243Denzel Washington
4413573Grey's Anatomy2016243Denzel Washington
..................
66068571700Remembrance: A Portrait Study19679913311Edward Owens
660712127402Autrefois J'ai aime une femme19669913311Edward Owens
660812666216Sisters of the Screen - African Women in Cinema196611720380Beti Ellerson
660912666216Sisters of the Screen - African Women in Cinema200211720380Beti Ellerson
661010038432A Luta Continua197210555174Robert Van Lierop

6316 rows × 5 columns

Remove all movies with no year, i.e. Year == 0 (possibly television shows)

# Keep only rows that carry a real release year (Year == 0 marks a missing year).
# .copy() makes the filtered frame independent of the unfiltered one, so the
# column assignments done later on df_movies do not raise SettingWithCopyWarning.
df_movies = df_movies[df_movies.Year != 0].copy()
# Summary statistics of the numeric columns after filtering.
df_movies.describe()

Movie_IDYearPerson_ID
count5.699000e+035699.0000005.699000e+03
mean4.781896e+061998.6564319.356992e+05
std4.761429e+0619.8594451.235733e+06
min1.187000e+041919.0000001.510000e+02
25%3.851770e+051991.0000002.288530e+05
50%3.223890e+062004.0000006.085750e+05
75%8.135978e+062014.0000009.373060e+05
max2.107769e+072027.0000001.172038e+07
# Show every row whose Year equals 2021.
df_movies.loc[df_movies['Year'] == 2021]

Movie_IDTitleYearPerson_IDName
2995854A Journal for Jordan2021243Denzel Washington
1914324128NYC Epicenters 9/11-2021½2021490Spike Lee
18513935770Chris Rock Total Blackout: The Tamborine Exten...20211674Chris Rock
26710608454Harlem20212700Malcolm D. Lee
2683554046Space Jam: A New Legacy20212700Malcolm D. Lee
..................
648712223626Black Seeds: The History of Africans in America20215679045Bayer Mack
65298537542Cold Blood20216462306Taylor Ri'chard
654113814666Pursued20217280958Fathia Absie
655215596914Voodoo20217385237Phillip Youmans
660411753636Private Imaginings and Narrative Facts20219913311Edward Owens

173 rows × 5 columns

# Scatter plot with Title on the x axis and Year on the y axis, one point per row.
df_movies.plot(x = 'Title', y='Year', kind = 'scatter')

AxesSubplot

import matplotlib.pyplot as plt
import seaborn as sns

# Histogram of the Year column with a kernel density estimate overlaid (kde=True).
sns.histplot(df_movies.Year, kde=True)
plt.show()

AxesSubplot

# Summary statistics (count, mean, std, min, quartiles, max) of the Year column.
df_movies['Year'].describe()
count    5699.000000
mean     1998.656431
std        19.859445
min      1919.000000
25%      1991.000000
50%      2004.000000
75%      2014.000000
max      2027.000000
Name: Year, dtype: float64
def df_director_id(name: str):
    """Return the Person_ID of the first ``df_movies`` row whose Name equals *name*.

    The ID is converted to a string and left-padded with zeros to at least
    seven characters. ``None`` is returned when no row matches.
    """
    padded_ids = (
        str(row.Person_ID).zfill(7)
        for row in df_movies.itertuples()
        if row.Name == name
    )
    # The generator is lazy, so scanning stops at the first matching row.
    return next(padded_ids, None)
def movie_format(movie_id):
    """Join the items of a multi-valued movie field into one string.

    Despite its name, ``movie_id`` is the field value itself: the callers in
    this file pass the results of ``movie.get('runtime')`` and
    ``movie.get('genres')``.

    Args:
        movie_id: An iterable of values, or ``None`` when the field is absent.

    Returns:
        ``None`` when ``movie_id`` is ``None``; otherwise the items converted
        with ``str`` and joined by ``", "``. A single item is returned as-is
        and an empty iterable yields ``""``.
    """
    if movie_id is None:
        return None
    # A separator only ever appears between two items, so one join covers the
    # empty, single-item and multi-item cases alike.
    return ', '.join(map(str, movie_id))


def movie_metascore(movie_id):
    """Return the value stored under the 'metascore' key of *movie_id*.

    The lookup uses plain subscription, so a missing key raises whatever the
    object's ``__getitem__`` raises (``KeyError`` for a dict).
    """
    metascore = movie_id['metascore']
    return metascore


def movie_budget(movie_id):
    """Extract the first number found in a movie's 'box office' value.

    Args:
        movie_id: A mapping-like movie object supporting ``.get``.

    Returns:
        The first run of digits and commas (ending in a digit) found in the
        string form of ``movie_id['box office']``, e.g. ``'1,000,000'``.
        ``None`` when the key is missing, its value is ``None``, or the value
        contains no digit.
    """
    box_office = movie_id.get('box office', None)
    if box_office is None:
        return None
    # The value is stringified first so that any container type can be searched.
    # NOTE(review): this returns the first number in the text, whichever
    # box-office figure it belongs to - confirm that is always the budget.
    found = re.search(r'[0-9,]*\d', str(box_office))
    return found.group(0) if found else None
    
#tstmovie = ia.get_movie('13972246', info=['main', 'critic_reviews', 'vote details', 'plot'])
#print(tstmovie['box office'])
#budget = re.search('[0-9,]*\d', '$1') #str(tstmovie['box office']))
#print(budget)
#print(movie_budget(tstmovie))
# print(movie_metascore(tstmovie))
# Add the columns that the scraping loop below fills in, initialised to empty strings.
df_movies['Type'] = ""
df_movies['Genres'] = ""
df_movies['Metascore'] = ""
df_movies['Budget'] = ""
df_movies['Votes'] = ""
df_movies['Rating'] = ""
df_movies['Runtime'] = ""
df_movies
# Renumber the rows 0..n-1 so the loop below can address them by position;
# with the default drop=False the previous index is kept as a new 'index' column.
df_movies = df_movies.reset_index()
df_movies.head(20)
#x = 0
# Fetch IMDb details for rows 6049..6315 of df_movies and store them in the
# columns created above.
# NOTE(review): the range is hard-coded; presumably it is the resume point of an
# earlier partial run - confirm it matches the current length of df_movies.
for i in range (6049, 6316): #df_movies.itertuples():
    # Every 150th row: print the current movie and pause for three minutes
    # (presumably to avoid being throttled by IMDb - confirm).
    if i % 150 == 0:
        print(f"{df_movies['Movie_ID'][i]} and  {df_movies['Title'][i]}") 
        time.sleep(180) # 3 minutes
    # Request only the listed infosets for this movie.
    movie = ia.get_movie(df_movies['Movie_ID'][i], info=['main', 'critic_reviews', 'vote details', 'plot']) #ia.get_movie(df_movies['Movie_ID'][i])
    # Missing keys are stored as None; runtime and genres go through
    # movie_format, which joins their items into one comma-separated string.
    df_movies.loc[i, 'Type']        = movie.get('kind', None)
    df_movies.loc[i, 'Runtime']     = movie_format(movie.get('runtime', None)) #movie.get('runtime', None)
    df_movies.loc[i, 'Rating']      = movie.get('rating', None)
    df_movies.loc[i, 'Votes']       = movie.get('votes', None)
    df_movies.loc[i, 'Genres']      = movie_format(movie.get('genres', None)) #str(movie.get('genres', None))
    df_movies.loc[i, 'Metascore']   = movie.get('metascore', None)
    # movie_budget returns the first number found in the 'box office' value.
    df_movies.loc[i, 'Budget']      = movie_budget(movie)
    # Disabled debugging aid that stopped the loop after ten rows:
    #x +=1

    #df_movies

    # if x == 10:
    #     break
#df_movies.head(10)        
9847438 and  Football Town: Cleveland Ohio
6340186 and  Film 13
# Persist the enriched movie table.
df_movies.to_csv("Director_Movies_Analyze.csv", encoding='utf-8-sig')
# Print every column except the first; despite its name, the loop variable
# holds a column label, not a movie id.
for Movie_ID in df_movies.columns[1:]:
    print(df_movies[Movie_ID])
# Exploratory lookups of two people and one title to inspect the returned keys.
spike_lee = ia.get_person('0000490')
sidney_poitier = ia.get_person('0001627')
tvshow = ia.get_movie('209093', info='critic_reviews')
# Subscript access is used here, so a missing key raises instead of yielding None.
print(f"{spike_lee['birth date']}  {spike_lee['height']} {spike_lee['birth info']} ")
print(f"{sidney_poitier['birth date']}  {sidney_poitier['height']} {sidney_poitier['birth info']} ")
1957-03-20  5' 7" (1.7 m) {'birth place': 'Atlanta, Georgia, USA'} 
1927-02-20  6' 2½" (1.89 m) {'birth place': 'Miami, Florida, USA'} 
# Show which keys each fetched infoset contributed to `tvshow`.
tvshow.infoset2keys
{'critic_reviews': []}
# Exploratory fetch of a single title with the same infosets the scraping loop uses.
# NOTE(review): `mv` is not referenced again in the visible part of the file.
mv = ia.get_movie('790770', info=['main', 'critic_reviews', 'vote details', 'plot'])
#metascore = ia.get_movie_critic_reviews('7349662')
def person_height_ft(person_id):
    """Return the feet-and-inches part of a person's 'height' value.

    Args:
        person_id: A mapping-like person object supporting ``.get``; its
            'height' value looks like ``5' 7" (1.7 m)`` in this file's output.

    Returns:
        The leading imperial portion of the height, e.g. ``5' 7"``. ``None``
        when there is no 'height' value or it contains no feet/inches pattern.
    """
    person_height = person_id.get('height', None)
    if person_height is None:
        return None
    # Digits/apostrophes, one whitespace character, digits, then at most one
    # trailing character (normally the inch mark).
    feet_inches = re.search(r"[0-9']+\s[0-9]+.?", str(person_height))
    return feet_inches.group(0) if feet_inches else None

# Exploratory fetch of one person to inspect the shape of the 'birth info' value.
tstperson = ia.get_person('0002700')
#str(director.Person_ID).zfill(7)
#print(tstmovie['box office'])
#budget = re.search('[0-9,]*\d', '$1') #str(tstmovie['box office']))
#print(budget)
# None when the person has no 'birth info' entry.
birth_list = tstperson.get('birth info', None)
# Disabled attempts at splitting the birth place into three parts; the value
# printed for this person has a single part, so a three-way unpack would fail.
#city, state, country = str(birth_list['birth place']).split(", ")

#city, state, country = birth_list.split(',')
#print(tstperson['mini biography'])
#print(tstperson.get('height', None))
print(birth_list)
#print(f"{city} adfsfd {state} aerrod {country}")
#tstperson['birth info']
# print(movie_metascore(tstmovie))
{'birth place': 'USA'}
# Show which keys each fetched infoset contributed to `tstperson`.
tstperson.infoset2keys
{'main': ['birth info', 'headshot', 'filmography', 'imdbID', 'name'],
 'biography': ['headshot',
  'birth name',
  'height',
  'mini biography',
  'trade mark',
  'trivia',
  'quotes',
  'birth date',
  'birth notes']}
# Reload the partially enriched director table (first CSV column is the index).
df_director_names = pd.read_csv("director_names_analyze.csv", index_col=[0], encoding="utf-8-sig")
# Fetch birth date, height, birth place, headshot and biography for rows 67..310.
# NOTE(review): the range is hard-coded; presumably it is the resume point of an
# earlier partial run - confirm before re-running.
for i in range(67, 311):
    # Rows with Person_ID 0 have no IMDb id to look up.
    if df_director_names['Person_ID'][i] == 0:
        continue
    # Every 100th row: print the current director and pause for three minutes.
    if i % 100 == 0:
        print(f"{df_director_names['Person_ID'][i]} and  {df_director_names['Name'][i]}")
        time.sleep(180)  # 3 minutes
    director = ia.get_person(str(df_director_names['Person_ID'][i]).zfill(7))

    # Height looks like: 5' 7" (1.7 m). Either part may be absent, so each
    # regex result is checked before use instead of assuming a match.
    person_height = director.get('height', "")
    print(person_height)
    height_ft = height_m = None
    if person_height != "":
        height_text = str(person_height)
        ft_match = re.search(r"[0-9']*\s[0-9]*.?", height_text)
        m_match = re.search(r"\([0-9]+\.?[0-9]*", height_text)
        if ft_match:
            height_ft = ft_match.group(0)
        if m_match:
            height_m = m_match.group(0).replace('(', '')

    # Read the birth info of the director fetched in THIS iteration (the
    # original read it from the leftover `tstperson` test object, giving every
    # row the same birthplace).
    birth_list = director.get('birth info', "")
    birth_city = birth_state = birth_country = None
    birth_place = birth_list.get('birth place') if birth_list else None
    if birth_place:
        # "City, State, Country" is the common shape, but shorter and longer
        # values occur: the last part is the country, the one before it the
        # state, and the first part the city when at least three parts exist.
        parts = str(birth_place).split(", ")
        birth_country = parts[-1]
        if len(parts) >= 2:
            birth_state = parts[-2]
        if len(parts) >= 3:
            birth_city = parts[0]

    df_director_names.loc[i, 'Birth Date'] = director.get('birth date', None)
    df_director_names.loc[i, 'Height ft'] = height_ft
    df_director_names.loc[i, 'Height m'] = height_m
    df_director_names.loc[i, 'Birth City'] = birth_city
    df_director_names.loc[i, 'Birth State'] = birth_state
    df_director_names.loc[i, 'Birth Country'] = birth_country
    df_director_names.loc[i, 'Headshot'] = director.get('headshot', None)
    # Storing the biography is best-effort: a failure is recorded in the cell
    # and reported, but must not abort the whole run. KeyboardInterrupt is no
    # longer swallowed, unlike with the former bare `except`.
    # NOTE(review): the value may be a list, which .loc cannot store in a
    # single cell - confirm and join it into one string if so.
    try:
        df_director_names.loc[i, 'Biography'] = director.get('biography', None)
    except Exception as exc:
        df_director_names.loc[i, 'Biography'] = "Error"
        print(f"Could not store biography for row {i}: {exc}")
# Create (or reset) the enrichment columns as empty strings.
# NOTE(review): in file order these assignments overwrite whatever the loop
# above stored, right before the table is written to disk; presumably the
# notebook cells were executed in a different order - confirm before running
# this file top to bottom.
df_director_names['Birth Date'] = ""
df_director_names['Height ft'] = ""
df_director_names['Height m'] = ""
df_director_names['Birth City'] = ""
df_director_names['Birth State'] = ""
df_director_names['Birth Country'] = ""
df_director_names['Headshot'] = ""
df_director_names['Biography'] = ""
df_director_names
# Persist the director table; this is the same file the enrichment loop reads back.
df_director_names.to_csv("director_names_analyze.csv", encoding='utf-8-sig')
# Second pass over rows 0..310 of the director table that fills in only the
# three birth-place columns; the other enrichment columns are left untouched.
for i in range(0, 311):
    # Rows with Person_ID 0 have no IMDb id to look up.
    if df_director_names['Person_ID'][i] == 0:
        continue
    director = ia.get_person(str(df_director_names['Person_ID'][i]).zfill(7))

    birth_list = director.get('birth info', None)
    birth_city = birth_state = birth_country = None
    birth_place = birth_list.get('birth place') if birth_list else None
    if birth_place:
        # Split once and map by position:
        #   1 part  -> country
        #   2 parts -> state, country
        #   3 parts -> city, state, country
        #   4+ parts -> first part is the city, the last two are state and
        #               country, anything in between is dropped.
        # Values with more than four parts used to raise on unpacking.
        parts = str(birth_place).split(", ")
        birth_country = parts[-1]
        if len(parts) >= 2:
            birth_state = parts[-2]
        if len(parts) >= 3:
            birth_city = parts[0]

    df_director_names.loc[i, 'Birth City'] = birth_city
    df_director_names.loc[i, 'Birth State'] = birth_state
    df_director_names.loc[i, 'Birth Country'] = birth_country