# Source listing metadata: 211 lines, 4.7 KiB, Python
import pprint
|
|
import xarray
|
|
import numpy
|
|
import json
|
|
import glob
|
|
import io
|
|
import os
|
|
import pandas
|
|
import pickle
|
|
|
|
def kernel_1():
    """Load the competition CSV files as xarray objects, caching them on disk.

    When ``kernel_1-t3.dat`` is missing from the current working directory,
    every ``*.csv`` file directly under the Kaggle input directory is read
    with pandas, converted with ``to_xarray()`` and the resulting mapping is
    pickled into that file.  The mapping is then always read back from the
    pickle, so a warm cache never touches the CSV files.

    Returns:
        dict: ``{'t3': mapping}`` where ``mapping`` is keyed by the CSV path
        exactly as returned by ``glob.glob``.
    """
    cache_path = 'kernel_1-t3.dat'

    def preprocess(t4):
        # Build the cache file at ``t4``: one converted table per CSV path.
        input_dir = '/kaggle/input/mlb-player-digital-engagement-forecasting'
        pattern = os.path.join(input_dir, '*.csv')

        datasets = {}
        for csv_path in glob.glob(pattern):
            datasets[csv_path] = pandas.read_csv(csv_path).to_xarray()

        with io.open(t4, 'wb') as sink:
            pickle.dump(datasets, sink)

    cache_missing = not os.path.exists(cache_path)
    if cache_missing:
        preprocess(t4=cache_path)

    # The pickle is produced by preprocess() above; it is not meant to be
    # loaded from an untrusted location.
    with io.open(cache_path, 'rb') as source:
        return dict(t3=pickle.load(source))
|
|
|
|
def kernel_2(
    o_1=None,
):
    """Expand the nested JSON columns of ``train.csv`` into cached netCDF files.

    For each of ``playerTwitterFollowers``, ``teamTwitterFollowers``,
    ``games`` and ``events``:

    * if ``<name>.nc`` does not exist in the working directory, the column of
      that name is taken from the ``train.csv`` entry of ``o_1['t3']``, every
      string cell is decoded with ``json.loads`` (each cell must decode to a
      list of records), the records of all cells are flattened into a single
      ``pandas.DataFrame`` and written to ``<name>.nc``;
    * the dataset is then loaded back from disk.

    ``events`` gets one extra step: the index labels
    ``2017653 - 10 * 1000 .. 2017653`` (inclusive) are selected, stored as
    ``events-v2.nc``, and that reduced dataset is what is returned.

    Args:
        o_1: mapping with a ``'t3'`` entry keyed by CSV path (the shape
            ``kernel_1()`` returns).  Only read when a ``.nc`` cache file is
            missing; with the default ``None`` a missing cache fails with
            ``TypeError``.

    Returns:
        dict: ``{'t1': {name: dataset}}`` with one loaded dataset per name.
    """
    train_csv = '/kaggle/input/mlb-player-digital-engagement-forecasting/train.csv'

    # Hard-coded bounds of the index-label window kept in 'events-v2.nc'.
    events_last_index = 2017653
    events_window = 10 * 1000

    t1 = {}

    for k in [
        'playerTwitterFollowers',
        'teamTwitterFollowers',
        'games',
        'events'
    ]:
        t4 = '%s.nc' % k
        if not os.path.exists(t4):
            print('started %s' % t4)
            # Flatten in one pass.  The previous ``sum(lists, [])`` copied the
            # growing result on every addition (quadratic in the row count).
            records = [
                record
                for cell in o_1['t3'][train_csv][k].values
                if isinstance(cell, str)  # non-string cells (e.g. NaN) are skipped
                for record in json.loads(cell)
            ]
            pandas.DataFrame(records).to_xarray().to_netcdf(t4)
            print('cached %s' % t4)

        if k == 'events':
            t5 = '%s-v2.nc' % k
            if not os.path.exists(t5):
                full_events = xarray.load_dataset(t4)
                reduced_events = full_events.sel(
                    index=numpy.arange(
                        events_last_index - events_window,
                        events_last_index + 1
                    )
                )
                reduced_events.to_netcdf(t5)
            t1[k] = xarray.load_dataset(t5)
            print('loaded %s' % t5)
        else:
            t1[k] = xarray.load_dataset(t4)
            print('loaded %s' % t4)

    return dict(
        t1=t1,
    )
|
|
|
|
def kernel_3(should_exist=None):
    """Collect the four engagement datasets, preferring pre-built netCDF files.

    For ``playerTwitterFollowers``, ``teamTwitterFollowers``, ``games`` and
    ``events`` a file is looked up under ``/kaggle/input/garbage``
    (``<name>.nc``, except ``events`` which uses ``events-v2.nc``).  A file
    that exists is loaded with ``xarray.load_dataset``.  A missing file is
    either an error (``should_exist`` truthy) or is rebuilt through
    ``kernel_1()`` and ``kernel_2()``, each of which is called at most once
    per invocation.

    Args:
        should_exist: when truthy, a missing file prints ``[name, override,
            path]`` and raises instead of rebuilding.  ``None`` (the default)
            is treated as ``False``.

    Returns:
        dict: ``{'t5': {name: dataset}}``.

    Raises:
        NotImplementedError: a file is missing and ``should_exist`` is truthy.
    """
    if should_exist is None:
        should_exist = False

    # (dataset name, file-name override or None for '<name>.nc')
    t3 = [
        ('playerTwitterFollowers', None),
        ('teamTwitterFollowers', None),
        ('games', None),
        ('events', 'events-v2.nc'),
    ]

    # Lazily computed fallbacks, shared by all missing datasets.
    o_1 = None
    o_2 = None

    t4 = '/kaggle/input/garbage'
    t5 = {}
    for k, v in t3:
        file_name = '%s.nc' % k if v is None else v
        t1 = os.path.join(t4, file_name)

        if os.path.exists(t1):
            t5[k] = xarray.load_dataset(t1)
            continue

        if should_exist:
            pprint.pprint([k, v, t1])
            raise NotImplementedError

        if o_1 is None:
            o_1 = kernel_1()
        if o_2 is None:
            o_2 = kernel_2(
                o_1=o_1
            )

        # kernel_2 returns {'t1': {name: dataset}}.  The previous
        # ``o_2['events']`` raised KeyError and ignored which dataset was
        # being requested.
        t5[k] = o_2['t1'][k]

    return dict(
        t5=t5,
    )
|
|
|
|
def kernel_4(
    o_3=None,
):
    """Print sample rows and extract the events of game ``634280``.

    Prints, as indented JSON, the rows at positions ``-10 .. -2`` of the
    ``events`` and ``games`` tables, then selects every ``events`` entry whose
    ``gamePk`` equals ``634280`` and pretty-prints the last ``games`` row.

    Args:
        o_3: mapping with a ``'t5'`` entry holding the ``events`` and
            ``games`` datasets (the shape ``kernel_3()`` returns).

    Returns:
        dict: ``t2`` - the ``events`` entries for game ``634280``;
        ``t3`` - the last ``games`` row as a plain dict;
        ``t4`` - a reference video URL.

    Raises:
        AssertionError: the last ``games`` row is not game ``634280``.
    """
    events = o_3['t5']['events']
    games = o_3['t5']['games']

    # Convert once instead of once per printed row.
    events_df = events.to_dataframe()
    games_df = games.to_dataframe()

    # NOTE(review): range(-10, -1) stops at -2, so nine rows are printed and
    # the very last row is skipped - kept as in the original; confirm intent.
    for k in range(-10, -1):
        print(events_df.iloc[k].to_json(indent=4))

    for k in range(-10, -1):
        print(games_df.iloc[k].to_json(indent=4))

    t4 = 'https://www.youtube.com/watch?v=reaC7BHgL3M'

    # Sample ``games`` record for the game selected below:
    # {
    #     "gamePk":634280,
    #     "gameType":"R",
    #     "season":2021,
    #     "gameDate":"2021-04-30",
    #     "gameTimeUTC":"2021-04-30T23:37:00Z",
    #     "resumeDate":"",
    #     "resumedFrom":"",
    #     "codedGameState":"F",
    #     "detailedGameState":"Final",
    #     "isTie":0.0,
    #     "gameNumber":1,
    #     "doubleHeader":"N",
    #     "dayNight":"night",
    #     "scheduledInnings":9,
    #     "gamesInSeries":3.0,
    #     "seriesDescription":"Regular Season",
    #     "homeId":141,
    #     "homeName":"Toronto Blue Jays",
    #     "homeAbbrev":"TOR",
    #     "homeWins":12,
    #     "homeLosses":12,
    #     "homeWinPct":0.5,
    #     "homeWinner":true,
    #     "homeScore":13.0,
    #     "awayId":144,
    #     "awayName":"Atlanta Braves",
    #     "awayAbbrev":"ATL",
    #     "awayWins":12.0,
    #     "awayLosses":14.0,
    #     "awayWinPct":0.462,
    #     "awayWinner":false,
    #     "awayScore":5.0
    # }

    # Positions of the matching events, translated to index labels for .sel().
    t1 = numpy.where(events['gamePk'] == 634280)[0]
    t5 = events.index.data
    t6 = t5[t1]
    t2 = events.sel(index=t6)

    t3 = games_df.iloc[-1].to_dict()
    pprint.pprint(t3)
    # Sanity check: the last games row must be the game selected above.
    assert t3['gamePk'] == 634280

    # The original built this dict and dropped it (missing ``return``).
    return dict(
        t2=t2,
        t3=t3,
        t4=t4,
    )
|