Preparation

Imports

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import bar_chart_race as bcr
import reportlab as rpl
import matplotlib.font_manager as fm
import matplotlib
import matplotlib.ticker as mtick
from math import pi
# Use seaborn's "talk" plotting context for every figure created below.
sns.set_context("talk")



# Optional font setup, currently disabled. The rc call below asks matplotlib
# for the 'Poppins' family, so that font has to be available to matplotlib.
# NOTE(review): the curl line fetches a Sarabun font, not Poppins — confirm
# whether it is still needed.
#!curl -O https://github.com/Phonbopit/sarabun-webfont/raw/master/fonts/thsarabunnew-webfont.ttf
# NOTE(review): addfont() presumably registers the font by itself, so the
# `ttflist +=` part looks unnecessary — verify before re-enabling.
#fm.fontManager.ttflist += fm.fontManager.addfont('Poppins-Bold.ttf')
# Default font for all plots: Poppins, medium weight.
matplotlib.rc('font', family='Poppins', weight='medium')

Read and Inspect Dataset

# Input file, the profile to analyse and the file extension used for every
# figure that gets saved further down.
file = "ViewingActivity.csv"
profile = "Jalal"  # Some profile names have a space at the end; the value must match exactly.
export_type = ".jpg"

nflx = pd.read_csv(file)
if profile:
    # An empty profile string means "keep every profile".
    # .copy() detaches the filtered rows from the full frame, so the columns
    # added during preparation are written to an independent DataFrame and
    # pandas does not raise SettingWithCopyWarning.
    nflx = nflx[nflx["Profile Name"] == profile].copy()

nflx.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1118 entries, 518 to 10287
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype
---  ------                   --------------  -----
 0   Profile Name             1118 non-null   object
 1   Start Time               1118 non-null   object
 2   Duration                 1118 non-null   object
 3   Attributes               644 non-null    object
 4   Title                    1118 non-null   object
 5   Supplemental Video Type  181 non-null    object
 6   Device Type              1118 non-null   object
 7   Bookmark                 1118 non-null   object
 8   Latest Bookmark          1118 non-null   object
 9   Country                  1118 non-null   object
dtypes: object(10)
memory usage: 96.1+ KB

Prepare Dataset

# Convert Start Time to datetime and keep the calendar date separately.
nflx['Start Time'] = pd.to_datetime(nflx['Start Time'])
nflx['date'] = nflx['Start Time'].dt.date

# Convert Duration ("HH:MM:SS") into whole minutes. The seconds part is
# ignored, so an entry of 00:00:08 counts as 0 minutes.
nflx['dur_min'] = [
    int(parts[0]) * 60 + int(parts[1])
    for parts in nflx['Duration'].str.split(':')
]

# Split up Title into Series, Season and Episode and flag movies.
# This is more of an approximation: titles that are not in the format
# "Series: Season: Episode" (fewer than three ':'-separated parts) are
# classed as movies, which also catches a couple of oddly named series.
# The Title column is addressed by name rather than by position, so the
# result does not depend on the column order of the CSV.
title_parts = [title.split(":") for title in nflx["Title"]]
movie = [len(parts) < 3 for parts in title_parts]
series = [None if is_movie else parts[0] for parts, is_movie in zip(title_parts, movie)]
season = [None if is_movie else parts[1] for parts, is_movie in zip(title_parts, movie)]
episode = [None if is_movie else parts[2] for parts, is_movie in zip(title_parts, movie)]

nflx["series"] = series
nflx["season"] = season
nflx["episode"] = episode
nflx["movie"] = movie

# Weekday name per entry; `wdays` maps datetime's weekday number (0 = Monday)
# to the English name and is reused by the weekday plot.
wdays = {0:"Monday", 1:"Tuesday",2:"Wednesday",3:"Thursday",4:"Friday",5:"Saturday",6:"Sunday"}
nflx['weekday'] = nflx['Start Time'].dt.weekday.map(wdays)

# Checking the new DataFrame
nflx.info()
nflx.head(1)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1118 entries, 518 to 10287
Data columns (total 17 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   Profile Name             1118 non-null   object        
 1   Start Time               1118 non-null   datetime64[ns]
 2   Duration                 1118 non-null   object        
 3   Attributes               644 non-null    object        
 4   Title                    1118 non-null   object        
 5   Supplemental Video Type  181 non-null    object        
 6   Device Type              1118 non-null   object        
 7   Bookmark                 1118 non-null   object        
 8   Latest Bookmark          1118 non-null   object        
 9   Country                  1118 non-null   object        
 10  date                     1118 non-null   object        
 11  dur_min                  1118 non-null   int64         
 12  series                   757 non-null    object        
 13  season                   757 non-null    object        
 14  episode                  757 non-null    object        
 15  movie                    1118 non-null   bool          
 16  weekday                  1118 non-null   object        
dtypes: bool(1), datetime64[ns](1), int64(1), object(14)
memory usage: 149.6+ KB
Profile Name Start Time Duration Attributes Title Supplemental Video Type Device Type Bookmark Latest Bookmark Country date dur_min series season episode movie weekday
518 Jalal 2020-06-08 18:07:40 00:00:08 Autoplayed: user action: None; 365 Days_hook_primary_16x9 HOOK Edge (Cadmium) 00:00:08 00:00:08 DE (Germany) 2020-06-08 0 None None None True Monday

Total Watchtime

# Overall watchtime of the selected data, printed in minutes and then
# converted step by step into hours, days and weeks.
total_watchtime = nflx['dur_min'].sum()
print(f"Total Minutes: {total_watchtime!s}")
for unit_name, amount in (
    ("Hours", total_watchtime/60),
    ("Days", total_watchtime/60/24),
    ("Weeks", total_watchtime/60/24/7),
):
    print(f"= {unit_name}: {amount!s}")
Total Minutes: 23539
= Hours: 392.31666666666666
= Days: 16.346527777777776
= Weeks: 2.3352182539682538
# Define the plot style: switch matplotlib to its "dark_background" style
# sheet so the white axis styling applied by netflix_style() is visible.
plt.style.use("dark_background")

def netflix_style(ax):
    """Colour an Axes white and embolden its labels and title.

    All four spines and the tick marks/labels of both axes are set to
    white; the x label, y label and title are set to white and bold.
    The same ``ax`` object is returned so the call can be chained.
    """
    for side in ("bottom", "top", "left", "right"):
        ax.spines[side].set_color("white")

    for axis_name in ("x", "y"):
        ax.tick_params(axis=axis_name, colors="white")

    for text in (ax.yaxis.label, ax.xaxis.label, ax.title):
        text.set_color("white")
        text.set_weight("bold")

    return ax

Plot Total Watchtime

Per Week

# Plot watchtime throughout the record, one point per week.

# Resample to weekly totals. Only 'dur_min' is summed: summing every column
# would also try to add up the text and date columns, which fails (or
# concatenates strings) on current pandas versions.
df_resample = nflx.resample('W', on="Start Time")['dur_min'].sum().reset_index()
df_resample['dur_hrs'] = df_resample['dur_min']/60

x = df_resample['Start Time']
y = df_resample['dur_hrs']

print(x.head())

# Plot hours watched per week
g = sns.relplot(y='dur_hrs', x="Start Time", data=df_resample, kind="line", aspect=3, color="#E50914")
g.set_xlabels("Time")
g.set_ylabels("Hours watched (per week)")

ax1 = g.axes[0][0]  # The single Axes of the FacetGrid
ax1 = netflix_style(ax1)  # Applying Netflix style
ax1.set_ylim(0,)  # Y axis starts at 0

# Mean line; the legend call makes its label visible, as in the other plots.
mean_hrs = y.mean()
ax1.axhline(mean_hrs, ls="--", label="Mean ("+str(round(mean_hrs,2))+")", color="white")
ax1.legend()

# Peak week. idxmax() returns the first row with the maximum, so xmax is
# always a single Timestamp, even when several weeks tie for the maximum.
ymax = y.max()
xmax = x[y.idxmax()]

# The annotation text sits a quarter of the record to the left of the peak;
# if that would fall on or before the first week, it goes to the right.
xposition = xmax-datetime.timedelta(weeks=len(x)/4)
if xposition <= x.iloc[0]:
    xposition = xmax+datetime.timedelta(weeks=len(x)/8)

yposition = ymax-2
if yposition < 1:
    yposition = ymax

# Weekly bins are labelled with their closing Sunday and cover the seven days
# Monday..Sunday, so the week starts six days before the label.
week_start = xmax-datetime.timedelta(days=6)
week_end = xmax
weekstring = week_start.strftime("%d.%m.%y")+" - "+week_end.strftime("%d.%m.%y")  # String showing the week

ax1.annotate('Max: '+str(round(ymax,2))+"hrs \n "+weekstring, xy=(xmax, ymax), xytext=(xposition, yposition),
            arrowprops=dict(arrowstyle='->'))  # Adding the arrow

plt.title("Netflix Watchtime ("+profile.replace(' ','')+") by Week", y=1.08, weight="bold")

# Shaded area under the curve, built from the data of the plotted line
# (the first line on the Axes; the mean line was added after it).
l1 = ax1.lines[0]
x1 = l1.get_xydata()[:,0]
y1 = l1.get_xydata()[:,1]
ax1.fill_between(x1,y1, color="red", alpha=0.3)

# Save file (same call as used for the other FacetGrid plots)
g.savefig(profile.replace(' ','')+"_watchtime_year"+export_type,bbox_inches = "tight")


0   2017-11-05
1   2017-11-12
2   2017-11-19
3   2017-11-26
4   2017-12-03
Name: Start Time, dtype: datetime64[ns]

alt

Series contribution to Best Week

# Pie chart: share of each series in the watchtime of the top week.

# The weekly totals are labelled with the closing Sunday (week_end) and cover
# Monday 00:00 up to, but not including, the following Monday 00:00. The
# window is derived from week_end alone and is half-open, so the whole
# closing Sunday is included and the Sunday before is not; that makes the
# total here match the weekly plot.
top_week_first = week_end.normalize() - datetime.timedelta(days=6)
top_week_stop = week_end.normalize() + datetime.timedelta(days=1)
display(top_week_first)
display(week_end)


# Define colour palette for the pie
palette=['#AA0000','#E50914', '#FF3D2D', '#FF5F46', '#FF7F60', '#FF9D7B']

# 'series_topweek' holds the per-series totals for the week in question.
# Entries without a series (classed as movies) are dropped by the groupby.
in_top_week = (nflx['Start Time'] >= top_week_first) & (nflx['Start Time'] < top_week_stop)
series_topweek = nflx[in_top_week].groupby(["series"]).agg({"dur_min":"sum","Start Time":"last"}).reset_index()
series_topweek["dur_hrs"] = series_topweek['dur_min']/60
series_topweek = series_topweek.sort_values(by="dur_hrs", ascending=False).head(20)
display(series_topweek['series'])

# Lists for the pie plot
pie_labels = series_topweek['series'].tolist()
pie_values = series_topweek["dur_min"].tolist()

# Display pie plot
fig, ax = plt.subplots()
ax.pie(pie_values, labels=pie_labels, autopct='%1.1f%%',
       shadow=False, startangle=90, colors=palette)


ax.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
plt.title("% of Watchtime Contributed by Series in Top-Week", y=1.08, weight="bold")
plt.show()

fig.savefig(profile.replace(' ','')+"_series_topweek"+export_type,bbox_inches = "tight")
Timestamp('2017-11-05 00:00:00')



Timestamp('2017-11-12 00:00:00')



0    The Walking Dead
Name: series, dtype: object

alt

% of Movies Watched

# Pie chart: watchtime spent on movies vs. series.

# Total minutes for each of the two groups, split on the boolean 'movie' flag.
is_movie = nflx['movie']
movie_watchtime = nflx.loc[is_movie, 'dur_min'].sum()
series_watchtime = nflx.loc[~is_movie, 'dur_min'].sum()

# One label per wedge, with the rounded number of hours in brackets.
wedge_labels = [
    name + ' \n(' + str(round(minutes/60)) + 'hrs)'
    for name, minutes in (("Movies", movie_watchtime), ("Series", series_watchtime))
]

fig, ax = plt.subplots()
autotexts = ax.pie(
    [movie_watchtime, series_watchtime],
    labels=wedge_labels,
    autopct='%1.1f%%',
    shadow=False,
    startangle=90,
    colors=["white", "#E50914"],
)

# The third element returned by pie() holds the percentage texts; the first
# one belongs to the white movie wedge and needs a dark colour to be readable.
pct_texts = autotexts[2]
pct_texts[0].set_color("black")

ax.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
plt.title("Proportion of Time Spent Watching Movies vs. Series", y=1.08, weight="bold")
plt.show()

fig.savefig(profile.replace(' ','')+"_movie_series"+export_type, bbox_inches="tight")

alt

Favourite Series

# Bar plot: the 20 series with the most watchtime.

# Define colour palette for the hue (one colour per year)
palette=['#AA0000','#E50914', '#FF3D2D', '#FF5F46', '#FF7F60', '#FF9D7B']

# Create the 'series' DataFrame: total minutes and earliest Start Time per
# series. "min" is used for the Start Time so that the "year first watched"
# does not depend on the row order of the CSV.
series = nflx[~nflx['movie']].groupby("series").agg({"dur_min":"sum","Start Time":"min"}).reset_index()
series["dur_hrs"] = series['dur_min']/60
series = series.sort_values(by="dur_hrs", ascending=False).head(20)

# Column 'Start Year' is used for colouring the bars
series['Start Year'] = series['Start Time'].dt.year

# Create plot. legend=False keeps seaborn from drawing the hue legend, so
# the only legend entry is the mean line added below.
g = sns.catplot(kind = "bar", x='series', y='dur_hrs', data=series, aspect=3, palette=palette, hue="Start Year", dodge=False, legend=False)
g.set_ylabels("Hours Watched")
g.set_xlabels("Series")
g.set_xticklabels(rotation=90)
ax1 = g.axes[0][0]  # The single Axes of the FacetGrid

ax1 = netflix_style(ax1)  # Applying Netflix style to the Axes

# Draw mean line over the plotted top-20 series
mean_hrs = series['dur_hrs'].mean()
ax1.axhline(mean_hrs,ls="--", label="Mean ("+str(mean_hrs.round(2))+")",color="white")

ax1.legend()
plt.title("Top 20 Series by Watchtime (Coloured by year first watched)", y=1.08, weight="bold")

# Save plot
g.savefig(profile.replace(' ','')+"_top20_series"+export_type,bbox_inches = "tight")

alt

Watchtime per Weekday

# Point plot: share of the total watchtime per weekday.

order = [
    'Monday', 'Tuesday', 'Wednesday', 'Thursday',
    'Friday', 'Saturday', 'Sunday',
]

# One row per weekday name in `wdays`: minutes watched on that weekday and
# the same figure as a percentage of the total watchtime.
weekdays = [wdays[key] for key in wdays]
total = [nflx[nflx['weekday'] == day_name]['dur_min'].sum() for day_name in weekdays]
proportion = [(minutes/total_watchtime)*100 for minutes in total]

df_wd = pd.DataFrame({'weekday': weekdays, 'dur_min': total, 'proportion': proportion})

print(df_wd)

g = sns.catplot(
    kind="point",
    x='weekday',
    y='proportion',
    ci=None,
    order=order,
    data=df_wd,
    aspect=1.6,
    color="#E50914",
)
g.set_ylabels("% of total time watched (avg)")
g.set_xlabels("Weekday")
g.set_xticklabels(rotation=90)

ax1 = netflix_style(g.axes[0][0])
ax1.yaxis.set_major_formatter(mtick.PercentFormatter())
# Reference line: the share each weekday would have if watchtime were spread
# evenly over the seven days.
ax1.axhline(100/7, ls="--", label="Expected Value")

ax1.legend()


plt.title("% of Time Watched per Weekday ("+profile.replace(' ','')+")", y=1.08, weight="bold")
g.savefig(profile.replace(' ','')+"_weekday"+export_type, bbox_inches="tight")


     weekday  dur_min  proportion
0     Monday     5085   12.765797
1    Tuesday     4189   10.516406
2  Wednesday     4814   12.085457
3   Thursday     3742    9.394221
4     Friday     6723   16.877966
5   Saturday     7100   17.824417
6     Sunday     8180   20.535737

alt

Watchtime per Hour

Barplot

# Point plot: share of the total watchtime per hour of the day.

# One row per hour 0..23: minutes of the entries that started in that hour
# and the same figure as a percentage of the total watchtime. The constant
# 'dummy' column gives the radar plot a single index value to pivot on.
hours = list(range(0, 24))
start_hour = nflx['Start Time'].dt.hour
total = [nflx[start_hour == hour_of_day]['dur_min'].sum() for hour_of_day in hours]
proportion = [(minutes/total_watchtime)*100 for minutes in total]
dummy = [0 for _hour in hours]

df_hd = pd.DataFrame({
    'hour': hours,
    'dur_min': total,
    'proportion': proportion,
    'dummy': dummy,
})

g = sns.catplot(
    kind="point",
    x='hour',
    y='proportion',
    ci=None,
    data=df_hd,
    aspect=1.6,
    color="#E50914",
)
g.set_ylabels("% of total time watched (avg)")
g.set_xlabels("Hour")
g.set_xticklabels(rotation=0)

ax1 = netflix_style(g.axes[0][0])
ax1.yaxis.set_major_formatter(mtick.PercentFormatter())
# Reference line: the share each hour would have if watchtime were spread
# evenly over the 24 hours.
ax1.axhline(100/24, ls="--", label="Expected Value")
ax1.legend()

plt.title("% of Watchtime per Hour ("+profile.replace(' ','')+")", y=1.08, weight="bold")

g.savefig(profile.replace(' ','')+"_hour"+export_type, bbox_inches="tight")

alt

Radarplot

# Radar plot for the hour distribution
# Source: https://python-graph-gallery.com/392-use-faceting-for-radar-chart/

# Pivot to a single row (dummy == 0) with one column per hour.
df_hd_pv = df_hd.pivot_table(index='dummy', columns='hour', values="proportion")

# Number of variables (one spoke per hour)
categories = list(df_hd_pv)
N = len(categories)

# Plot the first (only) row of the pivot table; the first value is repeated
# at the end to close the circular graph.
values = df_hd_pv.loc[0].values.flatten().tolist()
values += values[:1]

# Angle of each spoke: the full circle divided by the number of variables,
# again with the first angle repeated to close the shape.
angles = [n / float(N) * 2 * pi for n in range(N)]
angles += angles[:1]


# Initialise the spider plot: first category at the top, going clockwise.
ax = plt.subplot(111, polar=True)
ax.set_theta_offset(pi / 2)
ax.set_theta_direction(-1)

# Draw one axis per variable and add the hour labels
plt.xticks(angles[:-1], categories, color='white', size=12)

# Radial ticks in three steps up to the largest share. The step is at least
# 1, because a peak share below 3% would otherwise give range() a step of 0,
# which raises ValueError.
largest = df_hd["proportion"].max()
tick_step = max(1, int(largest/3))
tick_positions = range(0, int(largest)+1, tick_step)
ytick_label = [str(step)+"%" for step in tick_positions]

# Draw y labels
ax.set_rlabel_position(0)
plt.yticks(tick_positions, ytick_label, color="white", size=10)
plt.ylim(0,largest+0.5)


plt.title("% of Overall Watchtime per Hour", color="white", weight="bold", y=1.15)

# Plot data
ax.plot(angles, values, linewidth=1, linestyle='solid', color=palette[1])

# Fill area (colour given once, via the keyword)
ax.fill(angles, values, alpha=0.8, color=palette[1])

plt.savefig(profile.replace(' ','')+"_hour_radar"+export_type, bbox_inches = "tight")

alt