In [8]:
import pandas as pd
import matplotlib.pyplot as plt
df = pd.read_csv('netflix_titles.csv')
In [9]:
# Q1: How many titles are there per content type (Movie vs TV Show)?
df.groupby('type')['title'].count()
Out[9]:
type Movie 6131 TV Show 2676 Name: title, dtype: int64
In [10]:
# Q2: What is the number of titles released each year?
df.groupby('release_year')['title'].count().sort_index(ascending=False).head(10)
Out[10]:
release_year 2021 592 2020 953 2019 1030 2018 1147 2017 1032 2016 902 2015 560 2014 352 2013 288 2012 237 Name: title, dtype: int64
In [11]:
# Q3: Which countries produce the most content on Netflix?
df.groupby('country')['title'].count().sort_values(ascending=False).head(10)
Out[11]:
country United States 2818 India 972 United Kingdom 419 Japan 245 South Korea 199 Canada 181 Spain 145 France 124 Mexico 110 Egypt 106 Name: title, dtype: int64
In [12]:
# Q4: What is the number of titles per rating?
df.groupby('rating')['title'].count().sort_values(ascending=False)
Out[12]:
rating TV-MA 3207 TV-14 2160 TV-PG 863 R 799 PG-13 490 TV-Y7 334 TV-Y 307 PG 287 TV-G 220 NR 80 G 41 TV-Y7-FV 6 UR 3 NC-17 3 74 min 1 84 min 1 66 min 1 Name: title, dtype: int64
In [ ]:
# Q5: How many titles were added to Netflix each month of the year?
df['date_added'] = pd.to_datetime(df['date_added'], errors='coerce')
df['year_added'] = df['date_added'].dt.year.astype('Int64')
df['month_added'] = df['date_added'].dt.month_name()
df.groupby(['year_added', 'month_added'])['title'].count().sort_index(ascending=False).reset_index().head(20)
Out[ ]:
| year_added | month_added | title | |
|---|---|---|---|
| 0 | 2021 | September | 183 |
| 1 | 2021 | May | 132 |
| 2 | 2021 | March | 112 |
| 3 | 2021 | June | 207 |
| 4 | 2021 | July | 257 |
| 5 | 2021 | January | 132 |
| 6 | 2021 | February | 109 |
| 7 | 2021 | August | 178 |
| 8 | 2021 | April | 188 |
| 9 | 2020 | September | 168 |
| 10 | 2020 | October | 167 |
| 11 | 2020 | November | 154 |
| 12 | 2020 | May | 157 |
| 13 | 2020 | March | 137 |
| 14 | 2020 | June | 156 |
| 15 | 2020 | July | 146 |
| 16 | 2020 | January | 204 |
| 17 | 2020 | February | 114 |
| 18 | 2020 | December | 169 |
| 19 | 2020 | August | 129 |
In [30]:
# Q6: Which directors have the most titles on Netflix?
df.groupby('director')['title'].count().sort_values(ascending=False).dropna().head(10)
Out[30]:
director Rajiv Chilaka 19 Raúl Campos, Jan Suter 18 Suhas Kadav 16 Marcus Raboy 16 Jay Karas 14 Cathy Garcia-Molina 13 Jay Chapman 12 Youssef Chahine 12 Martin Scorsese 12 Steven Spielberg 11 Name: title, dtype: int64
In [31]:
# Q7: What are the top 5 most common genres or categories?
df_exploded = df.assign(genres=df['listed_in'].str.split(', ')).explode('genres')
df_exploded['genres'].value_counts().head(5)
Out[31]:
genres International Movies 2752 Dramas 2427 Comedies 1674 International TV Shows 1351 Documentaries 869 Name: count, dtype: int64