freelance-project-34-market.../python/tasks/mlb_player.py
Siarhei Siniak 49951a7373 [~] Refactor
2021-07-19 10:18:06 +03:00

211 lines
4.7 KiB
Python

import pprint
import xarray
import numpy
import json
import glob
import io
import os
import pandas
import pickle
def kernel_1():
    """Load every competition CSV as an xarray dataset, using a pickle cache.

    When ``kernel_1-t3.dat`` is absent from the current working directory,
    every ``*.csv`` file directly under the competition input directory is
    read with ``pandas.read_csv``, converted with ``DataFrame.to_xarray()``
    and the resulting mapping is pickled to that file.  The mapping is then
    always read back from the cache file.

    Returns:
        dict: ``{'t3': mapping}`` where ``mapping`` maps each CSV path (as
        returned by ``glob.glob``) to its converted dataset.
    """
    t4 = 'kernel_1-t3.dat'

    def preprocess(t4):
        """Build the CSV mapping and write it to ``t4`` atomically."""
        t1 = '/kaggle/input/mlb-player-digital-engagement-forecasting'
        t2 = glob.glob(os.path.join(t1, '*.csv'))
        t3 = {
            o: pandas.read_csv(o).to_xarray()
            for o in t2
        }

        # Write to a sibling temp file and rename it into place.  Writing
        # straight to ``t4`` would leave a truncated cache behind if the dump
        # is interrupted, and because the file would then exist, every later
        # call would skip preprocessing and fail inside pickle.load().
        tmp = '%s.%d.tmp' % (t4, os.getpid())
        try:
            with io.open(tmp, 'wb') as f:
                pickle.dump(t3, f)
            os.replace(tmp, t4)
        finally:
            # After a successful replace the temp file no longer exists;
            # this only removes leftovers from a failed dump.
            if os.path.exists(tmp):
                os.remove(tmp)

    if not os.path.exists(t4):
        preprocess(t4=t4)

    # NOTE: pickle.load executes arbitrary code from the file.  This is only
    # acceptable because the cache is produced by preprocess() above; do not
    # point t4 at a file from an untrusted source.
    with io.open(t4, 'rb') as f:
        t3 = pickle.load(f)

    return dict(
        t3=t3,
    )
def kernel_2(
    o_1=None,
):
    """Build or load one netCDF-cached dataset per nested ``train.csv`` column.

    For each of ``playerTwitterFollowers``, ``teamTwitterFollowers``,
    ``games`` and ``events`` a cache file ``<name>.nc`` is looked up in the
    current working directory.  When it is missing, the column of that name
    is taken from the ``train.csv`` dataset in ``o_1``, every string cell is
    JSON-decoded, the decoded lists are concatenated into one
    ``pandas.DataFrame``, converted with ``to_xarray()`` and written to the
    cache file.

    ``events`` gets a second cache, ``events-v2.nc``, holding only the rows
    whose index label lies in ``[2017653 - 10000, 2017653]``; that subset is
    what is returned for ``events``.

    Args:
        o_1: Result of ``kernel_1()``.  Only read when a ``<name>.nc`` cache
            has to be built; ``o_1['t3']`` must then contain the
            ``train.csv`` dataset keyed by its full path.

    Returns:
        dict: ``{'t1': mapping}`` where ``mapping`` maps each column name to
        its loaded ``xarray`` dataset.
    """
    t1 = {}

    for k in [
        'playerTwitterFollowers',
        'teamTwitterFollowers',
        'games',
        'events'
    ]:
        t4 = '%s.nc' % k

        if not os.path.exists(t4):
            print('started %s' % t4)
            t2 = '/kaggle/input/mlb-player-digital-engagement-forecasting/train.csv'
            # Flatten the per-cell JSON lists in a single pass.  The previous
            # sum(list_of_lists, []) recopied the growing accumulator on
            # every addition, which is quadratic in the number of records.
            # Non-string cells (missing values) are skipped.
            # assumes every string cell decodes to a JSON array -- TODO confirm
            t6 = [
                row
                for o in o_1['t3'][t2][k].values
                if isinstance(o, str)
                for row in json.loads(o)
            ]
            t3 = pandas.DataFrame(t6).to_xarray()
            t3.to_netcdf(t4)
            print('cached %s' % t4)

        if k == 'events':
            t5 = '%s-v2.nc' % k
            if not os.path.exists(t5):
                t2 = xarray.load_dataset(t4)
                # Keep 10001 rows ending at index label 2017653.
                # presumably the tail of the events table -- TODO confirm
                t3 = t2.sel(
                    index=numpy.arange(
                        2017653 - 10 * 1000,
                        2017653 + 1
                    )
                )
                t3.to_netcdf(t5)
            t1[k] = xarray.load_dataset(t5)
            print('loaded %s' % t5)
        else:
            t1[k] = xarray.load_dataset(t4)
            print('loaded %s' % t4)

    return dict(
        t1=t1,
    )
def kernel_3(should_exist=None):
    """Load the four derived datasets, preferring precomputed netCDF files.

    Each dataset is first looked up under ``/kaggle/input/garbage``
    (``<name>.nc``, or ``events-v2.nc`` for ``events``).  When a file is
    missing the datasets are rebuilt through ``kernel_1()`` and
    ``kernel_2()``, each of which is called at most once, and the matching
    entry of ``kernel_2``'s result is used instead.

    Args:
        should_exist: When true, a missing precomputed file is treated as an
            error instead of triggering a rebuild.  ``None`` means ``False``.

    Returns:
        dict: ``{'t5': mapping}`` where ``mapping`` maps each dataset name to
        its ``xarray`` dataset.

    Raises:
        NotImplementedError: A precomputed file is missing and
            ``should_exist`` is true; the offending name, file name and path
            are pretty-printed first.
    """
    if should_exist is None:
        should_exist = False

    # (dataset name, explicit file name or None for '<name>.nc')
    t3 = [
        ('playerTwitterFollowers', None),
        ('teamTwitterFollowers', None),
        ('games', None),
        ('events', 'events-v2.nc'),
    ]

    # Lazily computed fallbacks, shared by every missing dataset.
    o_1 = None
    o_2 = None

    t4 = '/kaggle/input/garbage'
    t5 = {}

    for k, v in t3:
        if v is None:
            t1 = os.path.join(
                t4,
                '%s.nc' % k,
            )
        else:
            t1 = os.path.join(
                t4,
                v,
            )

        if os.path.exists(t1):
            t2 = xarray.load_dataset(t1)
        else:
            if should_exist:
                pprint.pprint([k, v, t1])
                raise NotImplementedError

            if o_1 is None:
                o_1 = kernel_1()
            if o_2 is None:
                o_2 = kernel_2(
                    o_1=o_1
                )

            # kernel_2 returns dict(t1={name: dataset}).  The previous
            # o_2['events'] lookup raised KeyError, and would have handed the
            # events dataset to every missing name.
            t2 = o_2['t1'][k]

        t5[k] = t2

    return dict(
        t5=t5,
    )
def kernel_4(
    o_3=None,
):
    """Print sample rows and extract the events of game ``634280``.

    Prints rows ``-10`` through ``-2`` of the ``events`` table and then of
    the ``games`` table as indented JSON, selects every ``events`` row whose
    ``gamePk`` equals ``634280``, and pretty-prints the ``games`` row at
    position ``-19``, which is asserted to be that same game.

    Args:
        o_3: Result of ``kernel_3()``; ``o_3['t5']`` must provide the
            ``events`` and ``games`` datasets.

    Returns:
        dict: ``t2`` (the selected events), ``t3`` (the games row as a plain
        dict) and ``t4`` (a video URL).  Earlier versions built this dict
        but never returned it.

    Raises:
        AssertionError: The ``games`` row at position ``-19`` is not game
            ``634280``.
    """
    events = o_3['t5']['events']
    games = o_3['t5']['games']

    # Convert each dataset once; the conversion does not depend on the row
    # being printed.
    events_df = events.to_dataframe()
    for k in range(-10, -1):
        print(events_df.iloc[k].to_json(indent=4))

    games_df = games.to_dataframe()
    for k in range(-10, -1):
        print(games_df.iloc[k].to_json(indent=4))

    # presumably footage related to the game below -- TODO confirm
    t4 = 'https://www.youtube.com/watch?v=reaC7BHgL3M'

    # Reference record kept for the reader (a no-op string expression);
    # apparently the games row that the assertion below expects.
    r"""
        {
            "gamePk":634280,
            "gameType":"R",
            "season":2021,
            "gameDate":"2021-04-30",
            "gameTimeUTC":"2021-04-30T23:37:00Z",
            "resumeDate":"",
            "resumedFrom":"",
            "codedGameState":"F",
            "detailedGameState":"Final",
            "isTie":0.0,
            "gameNumber":1,
            "doubleHeader":"N",
            "dayNight":"night",
            "scheduledInnings":9,
            "gamesInSeries":3.0,
            "seriesDescription":"Regular Season",
            "homeId":141,
            "homeName":"Toronto Blue Jays",
            "homeAbbrev":"TOR",
            "homeWins":12,
            "homeLosses":12,
            "homeWinPct":0.5,
            "homeWinner":true,
            "homeScore":13.0,
            "awayId":144,
            "awayName":"Atlanta Braves",
            "awayAbbrev":"ATL",
            "awayWins":12.0,
            "awayLosses":14.0,
            "awayWinPct":0.462,
            "awayWinner":false,
            "awayScore":5.0
        }
    """

    # Positions of the matching rows, translated to index labels for .sel().
    t1 = numpy.where(events['gamePk'] == 634280)[0]
    t5 = events.index.data
    t6 = t5[t1]
    t2 = events.sel(index=t6)

    t3 = games_df.iloc[-19].to_dict()
    pprint.pprint(t3)
    # Sanity check on the hard-coded row position.
    assert t3['gamePk'] == 634280

    return dict(
        t2=t2,
        t3=t3,
        t4=t4,
    )