Game Ranking - Web Scraping with Python

by Jessica Chan

In [1]:
from io import StringIO

import requests
from bs4 import BeautifulSoup #Web scrapping Lib
import pandas as pd #Dataset lib
import matplotlib.pyplot as plt #Chart Visualization
import numpy as np
In [2]:
# Page that hosts the ranking table to scrape.
source_url = "https://newzoo.com/insights/rankings/top-20-core-pc-games/"
# requests applies no timeout by default, so an unresponsive server would
# block this cell (and the kernel) indefinitely; bound the wait explicitly.
r = requests.get(source_url, timeout=30)
In [3]:
r.status_code  # HTTP status of the response; 200 means the request succeeded
Out[3]:
200
In [4]:
len(r.content)  # size of the raw response body (r.content)
Out[4]:
54007
In [5]:
# Parse the HTML tables out of the response that was already downloaded,
# instead of handing the URL to pandas and fetching the page a second time
# (a second request is wasteful and may not return the same content as `r`).
r.raise_for_status()  # stop here on an HTTP error rather than parsing an error page
# Wrap the markup in StringIO: newer pandas deprecates passing a literal HTML string.
df_table = pd.read_html(StringIO(r.text), index_col=None)
df = df_table[0]  # the first table found on the page is the ranking

Displaying the entire table from the web without filtering.

In [6]:
# Show the full scraped table as parsed, before any columns are dropped.
df
Out[6]:
Image Rank Game Title Publisher Change
0 NaN 1 League of Legends Riot Games -
1 NaN 2 Hearthstone: Heroes of Warcraft Blizzard Entertainment 2
2 NaN 3 PLAYERUNKNOWN'S BATTLEGROUNDS Bluehole Studio -
3 NaN 4 Fortnite Epic Games 2
4 NaN 5 Counter-Strike: Global Offensive Valve Corporation -
5 NaN 6 Minecraft Mojang 1
6 NaN 7 Overwatch Blizzard Entertainment 1
7 NaN 8 Tom Clancy's Rainbow Six: Siege Ubisoft Entertainment -
8 NaN 9 World of Warcraft Blizzard Entertainment -
9 NaN 10 Grand Theft Auto V Rockstar Games 1
10 NaN 11 Dota 2 Valve Corporation 1
11 NaN 12 Realm Royale Hi-Rez Studios New!
12 NaN 13 Garry's Mod Valve Corporation 1
13 NaN 14 Heroes of the Storm Blizzard Entertainment 1
14 NaN 15 Rocket League Psyonix 2
15 NaN 16 World Of Tanks Wargaming.net 4
16 NaN 17 Diablo III Blizzard Entertainment 9
17 NaN 18 Warframe Digital Extremes 2
18 NaN 19 Fallout 4 Bethesda Softworks New!
19 NaN 20 The Elder Scrolls V: Skyrim Bethesda Softworks 2
In [7]:
df.loc[0]  # row whose index label is 0, returned as a Series keyed by column name
Out[7]:
Image                       NaN
Rank                          1
Game Title    League of Legends
Publisher            Riot Games
Change                        -
Name: 0, dtype: object
In [8]:
# Column labels of the scraped table.
df.columns
Out[8]:
Index(['Image', 'Rank', 'Game Title', 'Publisher', 'Change'], dtype='object')
In [9]:
df.shape[0]  # number of rows in the scraped table
Out[9]:
20
In [10]:
df.shape  # (number of rows, number of columns)
Out[10]:
(20, 5)
In [11]:
pd.unique(df['Publisher'])  # distinct publisher names, in order of first appearance
Out[11]:
array(['Riot Games', 'Blizzard Entertainment', 'Bluehole Studio',
       'Epic Games', 'Valve Corporation', 'Mojang',
       'Ubisoft Entertainment', 'Rockstar Games', 'Hi-Rez Studios',
       'Psyonix', 'Wargaming.net', 'Digital Extremes',
       'Bethesda Softworks'], dtype=object)
In [12]:
df['Publisher'].nunique()  # how many distinct publishers appear in the ranking
Out[12]:
13
In [13]:
# Data type pandas inferred for each column.
df.dtypes
Out[13]:
Image         float64
Rank            int64
Game Title     object
Publisher      object
Change         object
dtype: object
In [14]:
df['Publisher'].iloc[:3]  # first three entries of the Publisher column
Out[14]:
0                Riot Games
1    Blizzard Entertainment
2           Bluehole Studio
Name: Publisher, dtype: object
In [15]:
# Keep every row but skip the first column (position 0), leaving the
# columns from position 1 onward.
df2 = df.iloc[:,1:]
df2
Out[15]:
Rank Game Title Publisher Change
0 1 League of Legends Riot Games -
1 2 Hearthstone: Heroes of Warcraft Blizzard Entertainment 2
2 3 PLAYERUNKNOWN'S BATTLEGROUNDS Bluehole Studio -
3 4 Fortnite Epic Games 2
4 5 Counter-Strike: Global Offensive Valve Corporation -
5 6 Minecraft Mojang 1
6 7 Overwatch Blizzard Entertainment 1
7 8 Tom Clancy's Rainbow Six: Siege Ubisoft Entertainment -
8 9 World of Warcraft Blizzard Entertainment -
9 10 Grand Theft Auto V Rockstar Games 1
10 11 Dota 2 Valve Corporation 1
11 12 Realm Royale Hi-Rez Studios New!
12 13 Garry's Mod Valve Corporation 1
13 14 Heroes of the Storm Blizzard Entertainment 1
14 15 Rocket League Psyonix 2
15 16 World Of Tanks Wargaming.net 4
16 17 Diablo III Blizzard Entertainment 9
17 18 Warframe Digital Extremes 2
18 19 Fallout 4 Bethesda Softworks New!
19 20 The Elder Scrolls V: Skyrim Bethesda Softworks 2
In [16]:
# Column types of the trimmed table.
df2.dtypes
Out[16]:
Rank           int64
Game Title    object
Publisher     object
Change        object
dtype: object
In [17]:
df2.iloc[:5, :3]  # first five rows, first three columns
Out[17]:
Rank Game Title Publisher
0 1 League of Legends Riot Games
1 2 Hearthstone: Heroes of Warcraft Blizzard Entertainment
2 3 PLAYERUNKNOWN'S BATTLEGROUNDS Bluehole Studio
3 4 Fortnite Epic Games
4 5 Counter-Strike: Global Offensive Valve Corporation
In [18]:
df['Image'].isna().any()  # True if at least one Image value is missing
Out[18]:
True
In [19]:
df2.isna().any()  # per column: does it contain any missing value?
Out[19]:
Rank          False
Game Title    False
Publisher     False
Change        False
dtype: bool
In [20]:
# Distinct publisher names in the trimmed table.
obj = df2['Publisher'].unique()
obj
Out[20]:
array(['Riot Games', 'Blizzard Entertainment', 'Bluehole Studio',
       'Epic Games', 'Valve Corporation', 'Mojang',
       'Ubisoft Entertainment', 'Rockstar Games', 'Hi-Rez Studios',
       'Psyonix', 'Wargaming.net', 'Digital Extremes',
       'Bethesda Softworks'], dtype=object)
In [21]:
# Non-null count per column for the rows published by Blizzard Entertainment.
df2.loc[df2['Publisher'].eq("Blizzard Entertainment")].count()
Out[21]:
Rank          5
Game Title    5
Publisher     5
Change        5
dtype: int64
In [22]:
# Game titles grouped by the publisher that released them.
grp_df2 = df2.groupby('Publisher')['Game Title']

# Summary statistics (count / unique / top / freq) of the titles in each group.
grp_df2.describe()
Out[22]:
count unique top freq
Publisher
Bethesda Softworks 2 2 The Elder Scrolls V: Skyrim 1
Blizzard Entertainment 5 5 Hearthstone: Heroes of Warcraft 1
Bluehole Studio 1 1 PLAYERUNKNOWN'S BATTLEGROUNDS 1
Digital Extremes 1 1 Warframe 1
Epic Games 1 1 Fortnite 1
Hi-Rez Studios 1 1 Realm Royale 1
Mojang 1 1 Minecraft 1
Psyonix 1 1 Rocket League 1
Riot Games 1 1 League of Legends 1
Rockstar Games 1 1 Grand Theft Auto V 1
Ubisoft Entertainment 1 1 Tom Clancy's Rainbow Six: Siege 1
Valve Corporation 3 3 Dota 2 1
Wargaming.net 1 1 World Of Tanks 1
In [23]:
# Titles per publisher as a two-column frame (Publisher, count),
# ordered from the publisher with the most titles to the fewest.
groupCount1 = (
    grp_df2.count()
    .reset_index(name='count')
    .sort_values(['count'], ascending=False)
)

groupCount1
Out[23]:
Publisher count
1 Blizzard Entertainment 5
11 Valve Corporation 3
0 Bethesda Softworks 2
2 Bluehole Studio 1
3 Digital Extremes 1
4 Epic Games 1
5 Hi-Rez Studios 1
6 Mojang 1
7 Psyonix 1
8 Riot Games 1
9 Rockstar Games 1
10 Ubisoft Entertainment 1
12 Wargaming.net 1
In [24]:
# Split the per-publisher counts into two parallel lists for plotting.
# Series.tolist() replaces the original Series.get_values() call, which was
# deprecated in pandas 0.25 and removed in 1.0. It already yields built-in
# Python values, so the manual loop (with its unused enumerate index) and
# the per-element int() conversion are no longer needed.
labels = groupCount1['Publisher'].tolist()
titles_count = groupCount1['count'].astype(int).tolist()
titles_count
Out[24]:
[5, 3, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
In [25]:
labels[:3]  # first three publisher names from the labels list
Out[25]:
['Blizzard Entertainment', 'Valve Corporation', 'Bethesda Softworks']
In [26]:
# Horizontal bar chart of the three publishers with the most ranked titles.
plt.barh(labels[:3],titles_count[:3],align='center', alpha=0.5)
plt.xlabel('No. of Game Titles')
plt.title('No. of Titles by Publisher with Most Titles')
# End with plt.show() (as the pie chart cell does) so the cell renders the
# figure instead of echoing the Text object returned by plt.title().
plt.show()
Out[26]:
Text(0.5,1,'No. of Titles by Publisher with Most Titles')
In [27]:
# Inputs for the pie chart of the three publishers with the most titles.
explode = (0.2, 0, 0)  # only the first wedge is offset from the centre
colors = ['gold', 'yellowgreen', 'lightcoral']
sizes = titles_count[:3]
sizes
Out[27]:
[5, 3, 2]
In [28]:
# Pie chart of the top three publishers' share of titles; percentages are
# shown as whole numbers.
plt.pie(
    sizes,
    explode=explode,
    labels=labels[:3],
    colors=colors,
    autopct='%1.0f%%',
    shadow=True,
    startangle=45,
)

plt.axis('equal')  # equal axis scaling so the pie is drawn as a circle
plt.title('Top 3 Publishers with Most Game Titles')
plt.show()