import requests
import requests_cache
import pandas as pd
import lxml.html as lx
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.preprocessing import scale
from sklearn.decomposition import PCA
# Install a requests cache named 'project_cache' so that repeated requests.get()
# calls for the same page can be answered from the cache instead of the network.
requests_cache.install_cache('project_cache')
def allstar_players(recent_years, last_year=2017):
    """
    Input: recent_years, >= 1
           last_year: most recent all-star season to scrape (default 2017)
    Output: list of player names that occur more than once among the entries
            scraped from the last recent_years all-star pages; each name is
            listed once
    """
    from collections import Counter

    appearances = Counter()
    # range() rather than the Python-2-only xrange(), so this runs on 2 and 3
    for year in range(last_year - recent_years + 1, last_year + 1):
        url = 'http://www.basketball-reference.com/allstar/NBA_{}.html'.format(year)
        r = requests.get(url)
        r.raise_for_status()
        html = lx.fromstring(r.text)
        players_per_year = html.xpath("//th[@data-stat = 'player']/a")
        appearances.update(p.text_content() for p in players_per_year)
    # filter out those players who only occurred once; counting in a single
    # pass avoids calling list.count() once per entry
    return [name for name, count in appearances.items() if count > 1]
def rookie_players(draft_year):
    """
    Input: draft year
    Output: list with the text of every player cell found in that year's
            draft page
    """
    draft_page = requests.get(
        'http://www.basketball-reference.com/draft/NBA_{}.html'.format(draft_year))
    draft_page.raise_for_status()
    tree = lx.fromstring(draft_page.text)
    names = []
    for cell in tree.xpath("//table/tbody/tr/td[@data-stat = 'player']"):
        names.append(cell.text_content())
    return names
def player_stat(player, first_years = 4):
    """
    Input: player name
           first_years: number of leading seasons to keep, default value 4
    Output: a data frame with one row per season that contains the per-game and
            advanced stats for the player. Row k (1-based) is multiplied by the
            weight k / (1 + 2 + ... + first_years), so the weights sum to 1 and
            later seasons count more. A 'Player' column holding the requested
            name is appended.
    Raises: IndexError if the search page yields no player link;
            ValueError if the merged table does not have exactly first_years rows;
            requests.HTTPError for a failed HTTP request.
    """
    urlbase = 'http://www.basketball-reference.com'
    urlsearch = 'http://www.basketball-reference.com/search/search.fcgi'
    # search player: use the search system to locate the url of the page that contains stats for player
    r_search = requests.get(urlsearch, params = {'search': player})
    r_search.raise_for_status()
    html_search = lx.fromstring(r_search.text)
    url_suffix = html_search.xpath("//div[@class = 'search-item-name']/strong/a/@href")
    if not url_suffix:
        # same exception type as the bare url_suffix[0] lookup, but with context
        raise IndexError('no search result found for player {!r}'.format(player))
    url = urlbase + url_suffix[0]
    # player page
    r = requests.get(url)
    r.raise_for_status()
    r_html = lx.fromstring(r.text)
    # player basic data: stats names and values
    pg_stat_col_names = r_html.xpath("//div[@id = 'all_per_game']//table/thead//th")
    pg_stat_col_names = [i.text_content() for i in pg_stat_col_names]
    pg_stat_data = []
    pg_stat_data_tmp = r_html.xpath("//div[@id = 'all_per_game']//table/tbody/tr")
    for each_year in pg_stat_data_tmp[0:first_years]:
        pg_stat_data.append([i.text_content() for i in each_year.xpath("./*")])
    pg_stat_df = pd.DataFrame(pg_stat_data, columns = pg_stat_col_names)
    # player advanced data: the table is hidden inside an HTML comment
    ad_html_text = r_html.xpath("//div[@id = 'all_advanced']/comment()")[0]
    # [4:-3] strips the leading '<!--' and trailing '-->' before re-parsing
    ad_html = lx.fromstring(str(ad_html_text)[4:-3])
    ad_stat_col_names = ad_html.xpath("//table/thead//th")
    # skip the non-breaking-space spacer headers
    ad_stat_col_names = [i.text_content() for i in ad_stat_col_names if i.text_content() != u'\xa0']
    ad_stat_data = []
    ad_stat_data_tmp = ad_html.xpath("//table/tbody/tr")
    for each_year in ad_stat_data_tmp[0:first_years]:
        # skip empty cells so the row lines up with the filtered headers
        ad_stat_data.append([i.text_content() for i in each_year.xpath("./*") if i.text_content() != ''])
    ad_stat_df = pd.DataFrame(ad_stat_data, columns = ad_stat_col_names)
    # 'MP' exists in both tables; drop the advanced one before merging
    del ad_stat_df['MP']
    # join the two tables on the first six per-game columns
    mutual_col = pg_stat_df.columns[0:6]
    out_df = pd.merge(pg_stat_df, ad_stat_df, on = list(mutual_col))
    del out_df['Tm'], out_df['Pos'], out_df['Lg'], out_df['Age'], out_df['Season']
    # the weighting below needs exactly one row per requested season; fail with
    # a readable message instead of an obscure broadcasting error
    if len(out_df) != first_years:
        raise ValueError('expected {} seasons for player {!r}, found {}'.format(
            first_years, player, len(out_df)))
    # apply weights to each year
    weights = np.arange(1, first_years + 1) / float(np.arange(1, first_years + 1).sum())
    out_df = out_df.apply(pd.to_numeric)
    out_df = out_df.mul(weights, axis = 0)
    out_df['Player'] = [player] * first_years
    return out_df
def rookie_df(first_years, draft_year = 2013):
    """
    Input: this year - drafted_year cannot be smaller than first_years.
           For example, cannot request the first two-year stats for a one-year rookie player
    Output: a data frame contains draft_year drafted rookies' stats so far,
            each player's stats are reshaped to be one row. Columns are named
            '<stat>_<season number>' plus a final 'Player' column.
            Players whose stats cannot be fetched or parsed are skipped.
    Raises: ValueError if no player could be processed.
    """
    df_list = []
    for player_name in rookie_players(draft_year):
        try:
            df_element = player_stat(player_name, first_years)
            del df_element['Player']
            pd_array = np.array(df_element)
            # flatten season by season: all stats of season 1, then season 2, ...
            wide_columns = [x + '_' + str(y)
                            for y in range(1, first_years + 1)
                            for x in df_element.columns]
            df_element2 = pd.DataFrame(pd_array.reshape(1, pd_array.size),
                                       columns=wide_columns)
            df_element2['Player'] = player_name
            df_list.append(df_element2)
        except Exception:
            # best effort: skip this player. Unlike a bare "except:", this
            # lets KeyboardInterrupt / SystemExit stop the scraping loop.
            continue
    if not df_list:
        raise ValueError('no stats could be collected for draft year {}'.format(draft_year))
    df = pd.concat(df_list, axis=0)
    df.reset_index(drop=True, inplace=True)
    return df
def star_df(first_years, recent_years = 5):
    """
    Input: first_years, recent_years
    Output: a data frame contains the first_years stats for recent_years all star player,
            each row corresponds to one player. Columns are named
            '<stat>_<season number>' plus a final 'Player' column.
            Players whose stats cannot be fetched or parsed are skipped.
    Raises: ValueError if no player could be processed.
    """
    df_list = []
    for player_name in allstar_players(recent_years):
        try:
            df_element = player_stat(player_name, first_years)
            # the 'Player' column only repeats player_name, so it is not re-read
            del df_element['Player']
            pd_array = np.array(df_element)
            # flatten season by season: all stats of season 1, then season 2, ...
            wide_columns = [x + '_' + str(y)
                            for y in range(1, first_years + 1)
                            for x in df_element.columns]
            df_element2 = pd.DataFrame(pd_array.reshape(1, pd_array.size),
                                       columns=wide_columns)
            df_element2['Player'] = player_name
            df_list.append(df_element2)
        except Exception:
            # best effort: skip this player. Unlike a bare "except:", this
            # lets KeyboardInterrupt / SystemExit stop the scraping loop.
            continue
    if not df_list:
        raise ValueError('no stats could be collected for the all-star players')
    df = pd.concat(df_list, axis=0)
    df.reset_index(drop=True, inplace=True)
    return df
# Build the clustering input: first-4-season stats of the rookies (rookie_df's
# default draft year) and of the all-stars (star_df's default window), stacked
# into one frame with a fresh 0..n-1 index.
r = rookie_df(4)
s = star_df(4)
X = pd.concat([r, s], axis=0, ignore_index=True)
def empty_to_Zero(elt):
    """Return 0 for an empty string or None; return any other value unchanged."""
    if elt is None or elt == '':
        return 0
    return elt
# Replace blank / None cells and any remaining NaN with 0, then persist the table.
X = X.applymap(empty_to_Zero).fillna(0)
X.to_csv('total.csv', index = False)
# Split the numeric stats from the player names.
X_stat = X.drop('Player', axis=1)
X_player = X['Player']
# k-means with 5 clusters on the raw stats; keep the per-player cluster labels.
kmeans = KMeans(n_clusters=5, random_state=28, max_iter=10, algorithm='full').fit(X_stat)
cluster_label = kmeans.labels_
# find the optimal k: fit k-means for every k in k_values and record the
# inertia (sse) and the silhouette score of each fit
k_values = range(2, 11)
sse = []
slh_score = []
for k in k_values:
    kms = KMeans(n_clusters=k, random_state=0, max_iter=10)
    kms.fit(X_stat)
    sse.append(kms.inertia_)
    label = kms.labels_
    slh_score.append(metrics.silhouette_score(X_stat, label, metric='euclidean'))

fig = plt.figure(figsize=(7, 5))
plt.plot(k_values, slh_score, '-o')
plt.title('Silhouette score for numbers of clusters k')
plt.xlabel('k')
plt.ylabel('silhouette score')
# the arrow points at k = 5; look its score up by value instead of relying on
# the hard-coded list position 3
chosen_k = 5
plt.annotate('optimal k',
             xy=(chosen_k, slh_score[list(k_values).index(chosen_k)]),
             xytext=(5.5, 0.28),
             arrowprops=dict(facecolor='black', shrink=0.1, width=1, headwidth=5),
             )
# write the image first so it is saved regardless of what happens to the
# interactive window opened by show()
fig.savefig('./slh.png', dpi=400)
plt.show()
# Project the standardised stats onto their first two principal components.
# scale() is evaluated once and reused for both fit and transform.
X_stat_scaled = scale(X_stat)
pca = PCA(n_components=2)
pca.fit(X_stat_scaled)
X_new = pca.transform(X_stat_scaled)
# One row per player: name, the two PCA coordinates and the k-means label.
df_plot = pd.concat([X_player, pd.DataFrame(X_new), pd.Series(cluster_label)], axis = 1)
df_plot.columns = ['player', 'PC1', 'PC2', 'cluster']
# Flag the players returned by allstar_players(5).
star_roster = allstar_players(5)
df_plot['is_star'] = df_plot.player.isin(star_roster)
df_plot.cluster = df_plot.cluster.astype('category')
df_plot.to_csv('./df_plot.csv', index=False)
def rookie_player_avg(first_years):
    """This function gives a weighted average of the first input years of rookie players' stats.
    input: first_years(int): the number of seasons to average. The draft class scraped is
           2017 - first_years, e.g. 4 selects the 2013 rookies.
    output: a pandas dataframe with one row per player holding the weighted average of each
            stat plus a 'Player' column. Players whose stats cannot be fetched or parsed
            are skipped.
    raises: ValueError if no player could be processed.
    """
    df_list = []
    for rookie in rookie_players(2017 - first_years):
        try:
            df_element = player_stat(rookie, first_years)
            stats = df_element.drop('Player', axis=1)
            # player_stat already removes 'Season'; unconditionally deleting it
            # again raised KeyError for every player, so only drop it if present
            if 'Season' in stats.columns:
                stats = stats.drop('Season', axis=1)
            # blanks / missing values count as 0, as in the rest of the pipeline
            stats = stats.applymap(empty_to_Zero).astype(float).fillna(0)
            # player_stat has already multiplied each season's row by its
            # normalised weight (weights sum to 1), so the weighted average is
            # the column-wise sum. Applying [0.1, 0.2, 0.3, 0.4] again would
            # weight every season twice and only works for first_years == 4.
            avg_row = pd.DataFrame([stats.sum(axis=0)])
            avg_row['Player'] = rookie
            df_list.append(avg_row)
        except Exception:
            # best effort: skip this player. Unlike a bare "except:", this
            # lets KeyboardInterrupt / SystemExit stop the scraping loop.
            continue
    if not df_list:
        raise ValueError('no rookie stats could be collected')
    df = pd.concat(df_list, axis=0)
    df.reset_index(drop=True, inplace=True)
    return df
def star_player_avg(first_years,recent_years=5):
    """This function gives a weighted average of the first input years of star players' stats.
    input: first_years(int): the number of leading seasons to average, e.g. 4 means the first
           4 years of stats of each star player.
           recent_years(int): how many recent all-star seasons to draw the players from,
           e.g. 5 means the all-star players of the recent 5 years.
    output: a pandas dataframe with one row per player holding the weighted average of each
            stat plus a 'Player' column. Players whose stats cannot be fetched or parsed
            are skipped.
    raises: ValueError if no player could be processed.
    """
    df_list = []
    for star in allstar_players(recent_years):
        try:
            df_element = player_stat(star, first_years)
            stats = df_element.drop('Player', axis=1)
            # player_stat already removes 'Season'; unconditionally deleting it
            # again raised KeyError for every player, so only drop it if present
            if 'Season' in stats.columns:
                stats = stats.drop('Season', axis=1)
            # blanks / missing values count as 0, as in the rest of the pipeline
            stats = stats.applymap(empty_to_Zero).astype(float).fillna(0)
            # player_stat has already multiplied each season's row by its
            # normalised weight (weights sum to 1), so the weighted average is
            # the column-wise sum. Applying [0.1, 0.2, 0.3, 0.4] again would
            # weight every season twice and only works for first_years == 4.
            avg_row = pd.DataFrame([stats.sum(axis=0)])
            avg_row['Player'] = star
            df_list.append(avg_row)
        except Exception:
            # best effort: skip this player. Unlike a bare "except:", this
            # lets KeyboardInterrupt / SystemExit stop the scraping loop.
            continue
    if not df_list:
        raise ValueError('no all-star stats could be collected')
    df = pd.concat(df_list, axis=0)
    df.reset_index(drop=True, inplace=True)
    return df
# Weighted 4-season averages for rookies and all-stars, stacked with a fresh
# 0..n-1 index and written to disk (the index is written as the first column).
rookie_avg = rookie_player_avg(4)
star_avg = star_player_avg(4)
Z_avg = pd.concat([rookie_avg, star_avg], axis=0).reset_index(drop=True)
Z_avg.to_csv('Total_avg.csv')
def specific_stats(player,year):
    """Get the game-by-game regular-season stats of a player for the first input years.
    input: player(string): Player's full name.
           year(int): number of leading seasons to fetch.
    output(pandas DataFrame): one row per game-log entry of those seasons, with the
           bookkeeping columns removed, blank cells replaced by 0 and a 'Player' column added.
    raises: IndexError if the search page yields no player link;
            requests.HTTPError for a failed HTTP request.
    """
    urlbase = 'http://www.basketball-reference.com'
    urlsearch = 'http://www.basketball-reference.com/search/search.fcgi'
    # search player
    r_search = requests.get(urlsearch, params = {'search': player})
    r_search.raise_for_status()
    html_search = lx.fromstring(r_search.text)
    url_suffix = html_search.xpath("//div[@class = 'search-item-name']/strong/a/@href")
    if not url_suffix:
        # same exception type as the bare url_suffix[0] lookup, but with context
        raise IndexError('no search result found for player {!r}'.format(player))
    url = urlbase + url_suffix[0]
    # player specific data url
    r = requests.get(url)
    r.raise_for_status()
    r_html = lx.fromstring(r.text)
    pg_stat_year = r_html.xpath("//div[@id = 'all_per_game']//table/tbody//tr/th")
    pg_stat_year = [th.text_content() for th in pg_stat_year][:year]
    # the game-log url uses the first four characters of the season label + 1
    log_url = [url.replace(".html", "/gamelog/"+str(int(season[:4])+1)+'/')
               for season in pg_stat_year]
    # each season stats
    df_list = []
    for season_url in log_url:
        r = requests.get(season_url)
        r.raise_for_status()
        r_html = lx.fromstring(r.text)
        pg_stat_col_names = r_html.xpath("//div[@id = 'div_pgl_basic']//table/thead//th")
        pg_stat_col_names = [th.text_content() for th in pg_stat_col_names]
        pg_stat_data = []
        pg_stat_data_tmp = r_html.xpath("//div[@id = 'div_pgl_basic']//table/tbody/tr")
        for game_row in pg_stat_data_tmp:
            pg_stat_data.append([cell.text_content() for cell in game_row.xpath("./*")])
        pg_stat_df = pd.DataFrame(pg_stat_data, columns = pg_stat_col_names)
        df_list.append(pg_stat_df)
    df = pd.concat(df_list,axis=0)
    df = df.drop(['Date','Rk','G','Age','Tm','Opp','GS'], axis=1)
    # drop the first two remaining columns
    # NOTE(review): presumably the unlabeled home/away and result columns - confirm
    df = df.iloc[:,2:]
    # remove rows whose MP cell is the literal header text 'MP'
    df = df[df.MP != 'MP']
    df.reset_index(drop=True,inplace=True)
    df['Player']=player
    df = df.applymap(empty_to_Zero)
    return df
def compare(Player1,Player2,category,year = 4 ):
    """Build a DataFrame with one stat category for two players, game by game.
    input: Player1(string): Player1 full name.
           Player2(string): Player2 full name.
           category(string): specific stats column to compare such as 3P%.
           year(int): number of leading seasons to fetch for each player.
    output: a pandas DataFrame with two columns, the requested category and 'Player',
            holding Player1's rows followed by Player2's rows.
    """
    per_player = [specific_stats(name, year) for name in (Player1, Player2)]
    combined = pd.concat(per_player, axis=0)
    combined.reset_index(drop=True, inplace=True)
    return combined[[category, 'Player']]
# Next, we plot histograms comparing four pairs of players on a specific stat over their first 4 years.
# One histogram figure per pair of players. Each entry is
# (first player, second player, stat column, plot title, output file).
# NOTE(review): 'Michael Cart-Williams' and 'Nerlen Noel' look misspelled
# (presumably Michael Carter-Williams / Nerlens Noel). They are left untouched
# because they are sent to the site search as queries - confirm before changing.
comparison_plots = [
    ('James Harden', 'Michael Cart-Williams', '3P%', '3 point rate', 'James_Michael.png'),
    ('Draymond Green', 'Kelly Olynyk', 'TRB', 'Total Rebound', 'Green_Olynyk.png'),
    ('Paul Millsap', 'Nerlen Noel', 'BLK', 'Blocks', 'Millsap_Noel.png'),
    ('Tony Parker', 'Victor Oladipo', 'FG%', 'Field Goal%', 'Parker_Oladipo.png'),
]
for first_player, second_player, stat, plot_title, out_file in comparison_plots:
    Z = compare(first_player, second_player, stat)
    Z = Z.applymap(empty_to_Zero)
    # per-game values of the stat for each player, as floats
    Y1 = np.array(Z.loc[Z['Player']==first_player, stat]).astype(float)
    Y2 = np.array(Z.loc[Z['Player']==second_player, stat]).astype(float)
    fig = plt.figure()
    plt.hist(Y1, bins=15, color='g', alpha=0.7, label=first_player)
    plt.hist(Y2, bins=15, color='c', alpha=0.7, label=second_player)
    plt.legend()
    plt.title(plot_title)
    fig.savefig(out_file)
    plt.close(fig)
# Reload the averaged stats. The file is written as 'Total_avg.csv' (lower-case
# 'avg'), so that exact name is used here; 'Total_Avg.csv' only resolves on
# case-insensitive file systems. pd.read_csv(..., index_col=0) replaces
# DataFrame.from_csv, which newer pandas no longer provides.
Total_Avg = pd.read_csv('Total_avg.csv', index_col=0)
import numpy as np
import matplotlib.pyplot as plt
import sklearn as sklearn
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA, IncrementalPCA
from sklearn import preprocessing
from mpl_toolkits.mplot3d import Axes3D
from sklearn.cluster import KMeans
from sklearn import datasets
# First 46 columns by position (.iloc replaces the removed .ix indexer).
X = Total_Avg.iloc[:, 0:46]
# Standardise every column, then reduce to 3 principal components.
X_scaled = preprocessing.scale(X)
n_components = 3
pca = IncrementalPCA(n_components=n_components)
X_pca = pca.fit_transform(X_scaled)
np.random.seed(5)
centers = [[1, 1], [-1, -1], [1, -1]]
# Three k-means configurations to compare on the PCA coordinates.
estimators = {'k_means_iris_3': KMeans(n_clusters=3),
              'k_means_iris_8': KMeans(n_clusters=8),
              'k_means_iris_bad_init': KMeans(n_clusters=5, n_init=1,
                                              init='random')}
# One 3-D scatter per estimator, coloured by the fitted cluster label.
fignum = 1
for name, est in estimators.items():
    fig = plt.figure(fignum, figsize=(4, 3))
    plt.clf()
    # create the 3-D axes through the figure so it is actually attached to it;
    # a bare Axes3D(fig, ...) is no longer added automatically by newer matplotlib
    ax = fig.add_axes([0, 0, .95, 1], projection='3d', elev=48, azim=134)
    plt.cla()
    est.fit(X_pca)
    labels = est.labels_
    # builtin float replaces the removed np.float alias
    ax.scatter(X_pca[:, 1], X_pca[:, 0], X_pca[:, 2], c=labels.astype(float))
    # xaxis / yaxis / zaxis replace the removed w_xaxis / w_yaxis / w_zaxis aliases
    ax.xaxis.set_ticklabels([])
    ax.yaxis.set_ticklabels([])
    ax.zaxis.set_ticklabels([])
    ax.set_xlabel('PC2')
    ax.set_ylabel('PC1')
    ax.set_zlabel('PC3')
    fignum = fignum + 1

# Plot the ground truth
# NOTE(review): nothing is drawn on this last figure, so it shows empty axes -
# confirm whether it is still wanted.
fig = plt.figure(fignum, figsize=(4, 3))
plt.clf()
ax = fig.add_axes([0, 0, .95, 1], projection='3d', elev=48, azim=134)
plt.cla()
plt.show()




import time as time
import numpy as np
import matplotlib.pyplot as plt
import mpl_toolkits.mplot3d.axes3d as p3
from sklearn.cluster import AgglomerativeClustering
from sklearn.datasets.samples_generator import make_swiss_roll
# Ward-linkage agglomerative clustering (5 clusters) on the PCA coordinates,
# without a connectivity constraint; the fit is timed with wall-clock time.
print("Compute unstructured hierarchical clustering...")
st = time.time()
ward = AgglomerativeClustering(n_clusters=5, linkage='ward').fit(X_pca)
elapsed_time = time.time() - st
# per-sample cluster labels of the fitted model
label = ward.labels_
print("Elapsed time: %.2fs" % elapsed_time)
print("Number of points: %i" % label.size)
# Output:
#   Compute unstructured hierarchical clustering...
#   Elapsed time: 0.00s
#   Number of points: 84
# 3-D view of the unstructured clustering, one colour per cluster.
fig = plt.figure()
# full-figure 3-D axes created through the figure so it is attached to it;
# a bare Axes3D(fig) is no longer added automatically by newer matplotlib
ax = fig.add_axes([0.0, 0.0, 1.0, 1.0], projection='3d')
ax.view_init(7, -80)
for cluster_id in np.unique(label):
    in_cluster = label == cluster_id
    # builtin float replaces the removed np.float alias
    ax.plot3D(X_pca[in_cluster, 0], X_pca[in_cluster, 1], X_pca[in_cluster, 2],
              'o', color=plt.cm.jet(float(cluster_id) / np.max(label + 1)))
plt.title('Without connectivity constraints (time %.2fs)' % elapsed_time)
# Output: <matplotlib.text.Text at 0x11ba3a710>
plt.show()

from sklearn.neighbors import kneighbors_graph
# 5-nearest-neighbour graph over the PCA coordinates (a point is not its own
# neighbour); used below as the connectivity constraint.
connectivity = kneighbors_graph(X_pca, n_neighbors=5, include_self=False)
print("Compute structured hierarchical clustering...")
# Same Ward clustering as before, now restricted by the neighbour graph; timed
# with wall-clock time.
st = time.time()
ward = AgglomerativeClustering(n_clusters=5, connectivity=connectivity,
linkage='ward').fit(X_pca)
elapsed_time = time.time() - st
# per-sample cluster labels of the fitted model
label = ward.labels_
print("Elapsed time: %.2fs" % elapsed_time)
print("Number of points: %i" % label.size)
# Output:
#   Compute structured hierarchical clustering...
#   Elapsed time: 0.01s
#   Number of points: 84
# 3-D view of the connectivity-constrained clustering, one colour per cluster.
fig = plt.figure()
# full-figure 3-D axes created through the figure so it is attached to it;
# a bare Axes3D(fig) is no longer added automatically by newer matplotlib
ax = fig.add_axes([0.0, 0.0, 1.0, 1.0], projection='3d')
ax.view_init(10, -10)
for cluster_id in np.unique(label):
    in_cluster = label == cluster_id
    ax.plot3D(X_pca[in_cluster, 0], X_pca[in_cluster, 1], X_pca[in_cluster, 2],
              'o', color=plt.cm.jet(float(cluster_id) / np.max(label + 1)))
plt.title('With connectivity constraints (time %.2fs)' % elapsed_time)
# Output: <matplotlib.text.Text at 0x11bc3e0b8>
plt.show()

import time
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.neighbors import kneighbors_graph
# Reduce the standardised stats to 2 principal components for the 2-D plots.
n_components2 = 2
pca2 = IncrementalPCA(n_components=n_components2)
X_pca2 = pca2.fit_transform(X_scaled)
# 5-nearest-neighbour graph over the 2-D coordinates (a point is not its own
# neighbour); used below as the connectivity constraint.
knn_graph = kneighbors_graph(X_pca2, 5, include_self=False)
# Compare three linkage criteria side by side on the 2-D PCA coordinates.
plt.figure(figsize=(15, 4))
# Output: <matplotlib.figure.Figure at 0x11c42f588>
for index, linkage in enumerate(('average', 'complete', 'ward')):
    plt.subplot(1, 3, index + 1)
    model = AgglomerativeClustering(linkage=linkage, connectivity=knn_graph, n_clusters= 5)
    t0 = time.time()
    # fit on X_pca2: the connectivity graph was built from X_pca2 and the
    # scatter below plots X_pca2, so clustering the raw, unscaled X here was
    # inconsistent with both
    model.fit(X_pca2)
    elapsed_time = time.time() - t0
    # nipy_spectral is the current name of the removed 'spectral' colormap
    plt.scatter(X_pca2[:, 0], X_pca2[:, 1], c=model.labels_, cmap=plt.cm.nipy_spectral)
    plt.title('linkage=%s (time %.2fs)' % (linkage, elapsed_time), fontdict=dict(verticalalignment='top'))
    plt.axis('equal')
    plt.axis('off')
# figure-level layout and title only need to be set once
plt.subplots_adjust(bottom=0, top=.89, wspace=0, left=0, right=1)
plt.suptitle('n_cluster=%i, connectivity=%r' %
             (5, knn_graph is not None), size=17)
plt.show()
