MATH 219 | Project 1 March 22, 2024¶

0: [IGNORABLE BOILERPLATE]¶

In [ ]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=DeprecationWarning)

# CONDA LIBRARY IMPORTS
import seaborn as sns
import pandas as pds
import numpy as npy

1: [INTRODUCTION]¶

The data set chosen for this project was drawn from the official game statistics of professional baseball players over the 2023 MLB season. It was sourced entirely from the Statcast metrics platform at baseballsavant.mlb.com/statcast_search, though a .csv file with the specific type of data must first be generated from a filter search before it can be downloaded. The files used are under 20 MB total and should therefore already be included.¶
One issue to note: because of the complexity of the filter forms and the way the numbers are aggregated over the season, the data as it appears in this set was only obtainable per team, requiring 30 separate .csv files to cover my request. I have not renamed the files or otherwise modified the data, which makes the somewhat odd import-merge code below necessary. After the merging is complete, the last few lines of code confirm the data meets the minimum requirements.¶
In [ ]:
# start with one file, then merge the rest in loops
MLB = pds.read_csv("158_data.csv")

# csv files numbered 108 through 121 merged here
for index in range(108, 122):
    data = pds.read_csv("{0}_data.csv".format(index))
    MLB = pds.concat([MLB, data], ignore_index=True)

# csv files numbered 133 through 147 merged here
for index in range(133, 148):
    data = pds.read_csv("{0}_data.csv".format(index))
    MLB = pds.concat([MLB, data], ignore_index=True)

# sanity check size & format to show merge was successful
print("\nSize of data matrix is {0}\n\n".format(MLB.shape))
#TODO perhaps randomize the rows shown here before the print? TODO

MLB.head()
Size of data matrix is (23703, 92)


Out[ ]:
pitch_type game_date release_speed release_pos_x release_pos_z player_name batter pitcher events description ... fld_score post_away_score post_home_score post_bat_score post_fld_score if_fielding_alignment of_fielding_alignment spin_axis delta_home_win_exp delta_run_exp
0 FF 2023-10-01 92.4 -1.10 6.07 Crow-Armstrong, Pete 691718 605288 walk ball ... 3 0 3 0 3 Standard Standard 222.0 -0.012 0.110
1 FF 2023-10-01 92.9 -1.06 6.11 Canario, Alexander 672744 605288 strikeout called_strike ... 3 0 3 0 3 Standard Standard 214.0 0.021 -0.152
2 CU 2023-10-01 83.5 -1.41 5.35 Tauchman, Mike 643565 676083 strikeout called_strike ... 4 0 4 0 4 Standard Standard 39.0 0.008 -0.152
3 SI 2023-10-01 92.2 -1.47 5.78 Tauchman, Mike 643565 605288 walk ball ... 0 0 0 0 0 Standard Standard 227.0 -0.025 0.151
4 FC 2023-09-30 86.9 1.50 6.02 Gomes, Yan 543228 641778 walk ball ... 6 6 6 6 6 Standard Standard 160.0 -0.015 0.052

5 rows × 92 columns
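Rather than hard-coding the file numbers, the per-team files could also be discovered with a filename glob; a minimal sketch, assuming every `*_data.csv` file sits in one folder (`merge_team_csvs` is a hypothetical helper, not part of the project code):

```python
from pathlib import Path

import pandas as pds

def merge_team_csvs(folder="."):
    """Concatenate every *_data.csv found in `folder` into one frame."""
    paths = sorted(Path(folder).glob("*_data.csv"))
    frames = [pds.read_csv(p) for p in paths]
    return pds.concat(frames, ignore_index=True)
```

Collecting the frames in a list and concatenating once also avoids the quadratic copying that repeated `pds.concat` calls inside a loop incur.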

2: [Preprocessing]¶

Although the import stage is messy because of it, the filter generation means most pre-processing won't actually be necessary. Even so, I have decided to cherry-pick from the 92 attributes only those relevant to the topic I wish to explore. Each row in this data matrix represents the statistics from an individual pitch that resulted in either a walked batter or a called strike (i.e., the umpire judged the pitch a strike without the batter swinging). While most of the columns should be self-explanatory in future projects after being renamed, I will include a brief description of the ones that made the cut below. The reasoning behind choosing these specific features will be discussed in the 'summary' block.¶

The remaining code after the column filtering is "by-the-book" pre-processing and should be self-explanatory.¶

In [ ]:
MLB = MLB[
    [ "pitch_name", "pitch_type",
    "release_speed", "effective_speed",
    "release_pos_x", "release_pos_y", "release_pos_z",
    "ax", "ay", "vx0", "vy0", "vz0",
    "pfx_x", "pfx_z", "sz_top", "sz_bot",
    "spin_axis", "stand", "p_throws",
    "release_extension", "release_spin_rate",
    "plate_x", "plate_z",
    "events", "description" ]
]

# dropna() returns a new frame, so the result must be assigned back
MLB = MLB.dropna()
SOME IMPORTED COLUMN DESCRIPTIONS¶
release_pos (x, y, z):: 'y' is the ball's distance from home plate at release, and 'x'/'z' are the horizontal and vertical offsets, respectively.¶
pfx (x/z):: total horizontal and vertical movement of the ball, respectively, after its release, relative to the catcher's mitt.¶
plate (x/z):: horizontal and vertical position, respectively, of the pitch as it crosses home plate, relative to the catcher's mitt. This is independent of the strike zone, which is different for every batter.¶
release speed:: magnitude of pitch velocity at its release point, in the absolute direction of its release vector (100%).¶
effective speed:: an adjustment to release speed based on release extension and release vector; the ball's speed towards the strike zone.¶
sz (top/bottom):: the top and bottom heights of the hitter's strike zone (auto-set when the ball is halfway to the plate).¶
spin axis:: the axis the pitched ball rotates on; 180 degrees = toward the strike zone, 0 or 360 = toward the pitcher's mound.¶
a (x/y):: acceleration components of the pitch, measured at a fixed point shortly after it leaves the pitcher's hand.¶
vx0/vy0/vz0:: the velocity components of the ball, measured at that same fixed point shortly after release (not at the catcher).¶
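Since plate_x/plate_z are zone-independent while sz_top/sz_bot are per-batter, a rule-of-thumb in-zone flag can be derived by combining them. A minimal sketch, assuming the plate coordinates are in feet and ignoring the radius of the ball (`in_zone` is a hypothetical helper, not part of the project code):

```python
HALF_PLATE_FT = (17 / 2) / 12  # home plate is 17 inches wide; plate_x is in feet

def in_zone(plate_x, plate_z, sz_bot, sz_top):
    """True when the pitch crosses the plate inside the batter's strike zone."""
    return abs(plate_x) <= HALF_PLATE_FT and sz_bot <= plate_z <= sz_top
```

Applied row-wise to the frame, this would give a zone label that depends only on measured trajectory, not on the umpire's call.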
OUTLIER COUNTING FOR QUANTITATIVE COLUMNS:¶
In [ ]:
data = MLB.select_dtypes(include="number")

# 1.5*IQR fences for each quantitative column
Q25 = data.quantile(0.25)
Q75 = data.quantile(0.75)
IQR = Q75 - Q25
MIN = Q25 - 1.5*IQR
MAX = Q75 + 1.5*IQR

outliers = data[(data < MIN) | (data > MAX)].count()

print(outliers)
release_speed         242
effective_speed       304
release_pos_x           0
release_pos_y         255
release_pos_z         532
ax                      0
ay                     63
vx0                    88
vy0                   247
vz0                   105
pfx_x                   0
pfx_z                 297
sz_top                204
sz_bot                216
spin_axis             146
release_extension     348
release_spin_rate    1321
plate_x                16
plate_z                64
dtype: int64
These counts are mostly an insignificant fraction of the roughly 24K samples in the data set, with the exception of release_spin_rate, whose heavy tail may simply reflect genuine variation between pitch types. A related caveat applies to spin_axis: 180 degrees is the expected median, and values are hard-capped between 0 and 360, which describe essentially the same direction, so a symmetric outlier fence near the extremes is misleading for that column.¶
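The wrap-around point about degrees can be made concrete for spin_axis: because 0° and 360° are the same direction, ordinary means (and fences built from them) mislead near the extremes, and circular statistics are the standard fix. A minimal numpy sketch (`circular_mean_deg` is an illustrative helper, not used elsewhere in this project):

```python
import numpy as npy

def circular_mean_deg(degrees):
    """Mean of angles in degrees, treating 0 and 360 as the same direction."""
    rad = npy.deg2rad(npy.asarray(degrees, dtype=float))
    mean = npy.arctan2(npy.sin(rad).mean(), npy.cos(rad).mean())
    return npy.rad2deg(mean) % 360
```

A naive mean of [350, 10] is 180, the exact opposite direction; the circular mean lands at (or numerically next to) 0.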

3: [SUMMARY DATA ANALYSIS]¶

I have included some correlations between attributes I logically expected to have a critical relationship, such as spin rate and spin axis, or release speed and effective speed. I also portray several I expected nothing of, but found potentially interesting (such as vertical vs. horizontal position).¶
In [ ]:
print("By the names alone, the strong correlation here is almost trivially obvious, but it still serves as a sanity check on the usefulness of this data set")
print(data[["release_speed", "effective_speed"]].corr())
print()
print("A bit more interestingly, horizontal release position correlates noticeably more with horizontal movement (pfx_x) than the vertical pair does")
print(data[["release_pos_x", "pfx_x"]].corr())
print()
print(data[["release_pos_z", "pfx_z"]].corr())
print()
print("another seemingly trivial correlation: the top and bottom of the strike zone rise and fall together,")
print("since both heights scale with the individual batter's stature (see the sz_top/sz_bot description above)")
print(MLB[["sz_top", "sz_bot"]].corr())
By the names alone, the strong correlation here is almost trivially obvious, but it still serves as a sanity check on the usefulness of this data set
                 release_speed  effective_speed
release_speed         1.000000         0.846694
effective_speed       0.846694         1.000000

A bit more interestingly, horizontal release position correlates noticeably more with horizontal movement (pfx_x) than the vertical pair does
               release_pos_x     pfx_x
release_pos_x       1.000000  0.425303
pfx_x               0.425303  1.000000

               release_pos_z     pfx_z
release_pos_z       1.000000  0.130046
pfx_z               0.130046  1.000000

another seemingly trivial correlation: the top and bottom of the strike zone rise and fall together,
since both heights scale with the individual batter's stature (see the sz_top/sz_bot description above)
          sz_top    sz_bot
sz_top  1.000000  0.807802
sz_bot  0.807802  1.000000
While I would have loved to include a handsome graph for this whole data set, it was perhaps too ambitious to feed 30+ features of a roughly 24K-sample dataset into a heavyweight plotting package written in an interpreted language and expect a single plot in under ten minutes. I have therefore limited the graphical portion of the summary to 4 quantitative and 1 categorical column, as suggested in the project description.¶
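One workaround for the plotting cost, besides dropping columns, is to plot from a fixed random subsample; a minimal sketch, assuming a pandas frame shaped like MLB (`plot_sample` is a hypothetical helper):

```python
import pandas as pds

def plot_sample(df, n=2000, seed=0):
    """Reproducible random subsample, capped at the frame's own length."""
    return df.sample(n=min(n, len(df)), random_state=seed)
```

A few thousand rows are usually enough to keep boxplot quartiles and correlation trends recognizable while rendering in seconds rather than minutes.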
In [ ]:
data.describe()

# 'data' was reduced to numeric columns only, so it no longer contains
# pitch_type; facet on the full MLB frame instead
sns.catplot(data=MLB, col="pitch_type", x="release_speed", y="release_spin_rate", kind="box")

4: [DISCUSSION]¶

As mentioned before, I have cherry-picked the attributes in this data set to fit a particular scenario. The filter of at least partially full counts, and of pitches that resulted only in either a walked batter or a called (non-swinging) strike, leaves only critically important plays where the pitcher had the potential to either walk or retire the batter with one more throw. In all of these situations the batter decided the ball wasn't worth swinging at, deferring judgement to the behind-the-plate official and producing an umpire-called outcome. The first question, the categorical one, is fairly controversial in the sport, and the one that inspired the topic for this project:

{1} Is it desirable or even fair for the umpire position to be performed, or at least assisted (triggering call reviews), by an AI-trained system?¶

I will of course need to reduce this to a categorical classification question: if trained on data capturable in real time around a critical scenario like those discussed above, can a model accurately classify the categorical result the umpire would have declared (ball vs. called strike)? According to FanGraphs, umpire accuracy against the true strike zone has increased from 81.3% when pitch tracking began in 2008 to a recent peak of 92.4%. Being trained on more recent umpire calls, I would need a minimum of about 91% accuracy for a model to be considered effective enough to be potentially useful. The second question, the quantitative one, is one of reconstructing the pitch's position in the strike zone:
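To make question {1} concrete, the classification could be prototyped as a logistic regression over the pitch features; the sketch below uses a plain-numpy fit on synthetic stand-in data (the two features, the labeling rule, and the resulting accuracy are illustrative only, not results from this data set):

```python
import numpy as npy

def fit_logistic(X, y, lr=0.1, steps=500):
    """Logistic regression (ball=0 vs called strike=1) via gradient descent."""
    w, b = npy.zeros(X.shape[1]), 0.0
    for _ in range(steps):
        p = 1 / (1 + npy.exp(-(X @ w + b)))   # predicted P(called strike)
        w -= lr * (X.T @ (p - y)) / len(y)
        b -= lr * npy.mean(p - y)
    return w, b

def predict(X, w, b):
    return (X @ w + b) > 0

# synthetic stand-ins for two pitch features, labeled by a toy linear rule
rng = npy.random.default_rng(0)
X = rng.normal(size=(1000, 2))
y = (X @ npy.array([1.5, -1.0]) > 0).astype(float)
w, b = fit_logistic(X, y)
accuracy = npy.mean(predict(X, w, b) == (y > 0.5))
```

On the real data, the measured accuracy would be compared against the ~91% bar above; this toy fit only demonstrates the shape of the pipeline.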

{2} Given only the real-time information about a pitch before it is caught, such as its release position, initial movement, and spin, is it possible not just to categorize its call, but to predict its caught position inside or outside of the strike zone?¶

For example, would knowing that a curveball with a notably high release point and fast forward spin is approaching the plate enable a regression model to predict how low in the strike zone the pitch will be caught when it arrives? Without looking at previous umpire calls, this could quantify whether AI models really could call the strike zone more accurately than human umpires, since they would not be trained on the umpires' potential mistakes.
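Question {2} reduces to a regression; a minimal sketch of an ordinary least-squares fit with `numpy.linalg.lstsq`, on synthetic stand-ins for the release features (the coefficients and column roles are illustrative, not measured from this data set):

```python
import numpy as npy

rng = npy.random.default_rng(0)

# hypothetical stand-ins for release_pos_z, pfx_z, release_speed (1000 pitches)
X = rng.normal(size=(1000, 3))
true_coef = npy.array([0.6, 1.1, -0.2])
plate_z = X @ true_coef + 2.5 + rng.normal(scale=0.05, size=1000)

# least-squares fit with an intercept column appended
A = npy.column_stack([X, npy.ones(len(X))])
coef, *_ = npy.linalg.lstsq(A, plate_z, rcond=None)
predicted = A @ coef
```

Comparing `predicted` against sz_top/sz_bot would then say whether the pitch projects inside or outside the zone, without ever consulting an umpire's call.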