Project 1: Predicting a Pokemon Card's HP from its Features¶

By John Li

This project covers most of the Pokemon cards that exist for the first-generation Pokemon. In total there are 151 first-generation Pokemon and about 4,574 combined Pokemon cards across all of them.

My original idea was to categorize each card by its Pokemon, but that would have been far more intensive than what we learned in class, so I opted to predict the HP of the Pokemon card instead.

Scraping the Data¶

The most time-consuming aspect of this project was gathering the data. The features that I scraped were:

  • Pokemon Name
  • Stage
  • HP
  • Type
  • Attack Type(s)
  • Attack Name <- There was a bug in my code and all Attack Names were lost during the scraping
  • Attack Damage
  • Attack Text
  • Weakness Type
  • Resistance Type
  • Retreat cost
  • Rarity


These features are present on all Pokemon cards, which makes them well suited for predicting the HP of a card.

I got all the data from pkmncards.com. The site has all the features in plain text, so I did not have to do any image recognition. For more information on the scraping process, see the final markdown block below.

Project Plan¶

  • Import and clean data
  • Separate and categorize columns
  • Feature scale the data
  • Train the model
  • Evaluate

The model that I plan to use is linear regression with regularization, since predicting a continuous number, the HP, is a regression problem. I will use Ridge regression because I think most of the features I scraped will be useful for predicting the HP of the Pokemon.
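Concretely, Ridge regression minimizes the ordinary least-squares error plus an L2 penalty on the coefficient vector, with alpha controlling how strongly large coefficients are punished:

$$\min_{w}\; \lVert y - Xw \rVert_2^2 + \alpha \lVert w \rVert_2^2$$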

Importing all Necessary Libraries and Packages¶

In [ ]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_absolute_error, r2_score

Data Preparation and Preprocessing¶

Load Data¶

In [2]:
df = pd.read_csv('./data/pokemon_data.csv')
df = df.drop(columns=['attack_1_name', 'attack_2_name', 'attack_3_name', 'attack_4_name', 'attack_5_name'])  # Dropping attack name due to bug.

print(df.shape)
df.head()
(4558, 23)
Out[2]:
pokemon_name hp type stage rarity weakness resistance retreat_cost attack_1_cost attack_1_damage ... attack_2_text attack_3_cost attack_3_damage attack_3_text attack_4_cost attack_4_damage attack_4_text attack_5_cost attack_5_damage attack_5_text
0 Bulbasaur 40 HP Grass Basic Common Fire No Resistance 1 Grass Grass 20 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 Bulbasaur 40 HP Grass Basic Common Fire No Resistance 1 Grass Grass 20 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 Erika’s Bulbasaur 50 HP Grass Basic Uncommon Fire No Resistance 1 Grass 10 ... Flip a coin. If heads, you may search your dec... NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 Bulbasaur 40 HP Grass Basic Common Fire No Resistance 1 Grass Grass 20 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 Bulbasaur 50 HP Grass Basic Common Fire No Resistance 1 Colorless 10 ... Flip a coin. If heads, the Defending Pokémon i... NaN NaN NaN NaN NaN NaN NaN NaN NaN

5 rows × 23 columns

Process and Clean Data¶

We can see from the structure of our data that there is a lot of cleaning up to do. Each card has up to 5 attack slots, and most of those slots have the value NaN. Since some Pokemon have multiple attacks, we can combine them and find the total, average, and max of the attack cost and attack damage.

We can also combine the attack text and use TfidfVectorizer from sklearn to turn it into features.

Since the HP column is not numeric, we can easily convert that too.

In [3]:
text_cols = [col for col in df.columns if 'text' in col]
damage_cols = [col for col in df.columns if 'damage' in col]
attack_cols = [col for col in df.columns if 'attack' in col] # To be dropped later
attack_cost_cols = [col for col in df.columns if 'attack' in col and 'cost' in col]

df[text_cols] = df[text_cols].fillna('')
df['all_attack_text'] = df[text_cols].apply(lambda row: ' '.join(row), axis=1)

for col in damage_cols:
  df[col] = pd.to_numeric(df[col], errors='coerce')
df['num_attacks'] = df[damage_cols].notna().sum(axis=1)
df['max_damage'] = df[damage_cols].max(axis=1)
df['total_damage'] = df[damage_cols].sum(axis=1)
df['avg_damage'] = df[damage_cols].mean(axis=1)

for col in attack_cost_cols:
  df[col] = df[col].fillna('').apply(lambda x: len(str(x).split()))
df['max_energy_cost'] = df[attack_cost_cols].max(axis=1)
df['total_energy_cost'] = df[attack_cost_cols].sum(axis=1)

denominator = df['num_attacks'].clip(lower=1) # Prevent division by zero
df['avg_energy_cost'] = df[attack_cost_cols].sum(axis=1) / denominator

df['hp'] = df['hp'].replace(' HP', '', regex=True)
df['hp'] = pd.to_numeric(df['hp'], errors='coerce')

df.fillna(0, inplace=True)

df = df.drop(columns=attack_cols)

print(df.shape)
df.head()
(4558, 16)
Out[3]:
pokemon_name hp type stage rarity weakness resistance retreat_cost all_attack_text num_attacks max_damage total_damage avg_damage max_energy_cost total_energy_cost avg_energy_cost
0 Bulbasaur 40 Grass Basic Common Fire No Resistance 1 Unless all damage from this attack is prevente... 1 20.0 20.0 20.0 2 2 2.0
1 Bulbasaur 40 Grass Basic Common Fire No Resistance 1 Unless all damage from this attack is prevente... 1 20.0 20.0 20.0 2 2 2.0
2 Erika’s Bulbasaur 50 Grass Basic Uncommon Fire No Resistance 1 The Defending Pokémon is now Asleep. Flip a co... 1 10.0 10.0 10.0 2 3 3.0
3 Bulbasaur 40 Grass Basic Common Fire No Resistance 1 Unless all damage from this attack is prevente... 1 20.0 20.0 20.0 2 2 2.0
4 Bulbasaur 50 Grass Basic Common Fire No Resistance 1 Flip a coin. If heads, the Defending Pokémon ... 2 10.0 20.0 10.0 2 3 1.5

Identify and Separate Column Types¶

Now that our data frame is cleaned up, we can move on to converting the data into numeric form.

I first split the features into categorical, numerical, or plain text. For the categorical data we can use get_dummies from pandas, which splits each column into binary columns, one per category.

We mentioned above that we can use TfidfVectorizer to extract features from all_attack_text. This gives us a numeric matrix over the most common words, where each value is between 0 and 1 and reflects how heavily that word is weighted for that row.
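For reference, with its default settings TfidfVectorizer weights each term by its raw count in a row times a smoothed inverse document frequency, then normalizes each row to unit length (which is why the values stay between 0 and 1):

$$\text{tf-idf}(t, d) = \text{tf}(t, d)\cdot\left(\ln\frac{1 + n}{1 + \text{df}(t)} + 1\right)$$

where n is the number of cards and df(t) is the number of cards whose attack text contains the term t.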

In [4]:
y = df['hp']
X = df.drop(columns=['hp'])

categorical_features = ['type', 'stage', 'weakness', 'resistance', 'rarity']
numerical_features = ['retreat_cost', 'num_attacks', 'max_damage', 'total_damage', 'avg_damage', 'max_energy_cost', 'total_energy_cost', 'avg_energy_cost']
text_features = 'all_attack_text'

X_categorical_encoded = pd.get_dummies(X[categorical_features], drop_first=True)
X_base_features = pd.concat([X[numerical_features], X_categorical_encoded], axis=1)

tfidf = TfidfVectorizer(max_features=500) # Limit to top 500 features to reduce dimensionality
X_text_features = tfidf.fit_transform(X[text_features])
X_text_df = pd.DataFrame(X_text_features.toarray(), columns=tfidf.get_feature_names_out())

X_processed = pd.concat([X_base_features, X_text_df], axis=1)

print(X_processed.shape)
X_processed.head()
(4558, 581)
Out[4]:
retreat_cost num_attacks max_damage total_damage avg_damage max_energy_cost total_energy_cost avg_energy_cost type_Darkness type_Dragon ... without working works would you your yours zapdos zone zubat
0 1 1 20.0 20.0 20.0 2 2 2.0 False False ... 0.0 0.0 0.0 0.0 0.166934 0.000000 0.0 0.0 0.0 0.0
1 1 1 20.0 20.0 20.0 2 2 2.0 False False ... 0.0 0.0 0.0 0.0 0.166934 0.000000 0.0 0.0 0.0 0.0
2 1 1 10.0 10.0 10.0 2 3 3.0 False False ... 0.0 0.0 0.0 0.0 0.130478 0.326134 0.0 0.0 0.0 0.0
3 1 1 20.0 20.0 20.0 2 2 2.0 False False ... 0.0 0.0 0.0 0.0 0.166934 0.000000 0.0 0.0 0.0 0.0
4 1 2 10.0 20.0 10.0 2 3 1.5 False False ... 0.0 0.0 0.0 0.0 0.000000 0.000000 0.0 0.0 0.0 0.0

5 rows × 581 columns

Split the Data for Training and Testing¶

In [5]:
X_train, X_test, y_train, y_test = train_test_split(
  X_processed,
  y,
  test_size=0.2,
  random_state=1234
)

Feature Scaling¶

Since our columns are on very different scales, we have to scale them so that the model does not treat retreat_cost (which is usually 1 or 2) as less important than attack damage (which is anywhere from 10 to 200+) simply because of the difference in magnitude.
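One note on the scaler below: with with_mean=False, StandardScaler only divides each feature by its training-set standard deviation rather than fully standardizing it,

$$x' = \frac{x}{\sigma_{\text{train}}} \quad \text{instead of} \quad x' = \frac{x - \mu_{\text{train}}}{\sigma_{\text{train}}}$$

which is the usual choice when TF-IDF features are involved, since centering would destroy the sparsity of a sparse matrix (here the matrix is already dense, so this is mostly a convention).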

In [6]:
scaler = StandardScaler(with_mean=False)
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

Model Training¶

As discussed before, we are using linear regression with regularization, so we will use Ridge from sklearn as our model. I set alpha=1.0 as a default value since I do not yet know how much regularization we will need.

In [7]:
model = Ridge(alpha=1.0)
model.fit(X_train_scaled, y_train)
Out[7]:
Ridge()

What the Model Learned¶

Looking at the features that are most important to the model, we see that high rarity, high damage, and high retreat_cost raise the predicted HP, while early evolution stages (Basic, Stage 1) lower it.

In [8]:
coefficients = pd.Series(model.coef_, index=X_processed.columns)
print(coefficients.sort_values(ascending=False))
rarity_Ultra Rare     13.797590
max_damage            11.294669
rarity_Double Rare     9.829171
stage_VMAX             9.712907
retreat_cost           9.504890
                        ...    
if                    -5.751410
10                    -6.288372
stage_Stage 1         -8.787541
turn                  -9.902094
stage_Basic          -12.576607
Length: 581, dtype: float64

Model Evaluation¶

Here we predict the HP of our test group and evaluate the accuracy of our model. With a mean absolute error of 14.13 HP, I would say our model is quite accurate: since Pokemon card HP only increases in increments of 10, our error is between one and two increments.

We can also look at the R-squared value, which is 0.91; this means that 91% of the variability in HP values can be explained by the features in this model.
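For reference, the two metrics reported below are computed as

$$\text{MAE} = \frac{1}{n}\sum_{i=1}^{n}\lvert y_i - \hat{y}_i\rvert \qquad\qquad R^2 = 1 - \frac{\sum_{i}(y_i - \hat{y}_i)^2}{\sum_{i}(y_i - \bar{y})^2}$$

where $y_i$ are the true HP values, $\hat{y}_i$ the predictions, and $\bar{y}$ the mean HP of the test set.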

In [9]:
predictions = model.predict(X_test_scaled)
mae = mean_absolute_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

print("--- Model Evaluation Results ---")
print(f"Mean Absolute Error (MAE): {mae} HP")
print(f"R-squared (R^2): {r2}")
--- Model Evaluation Results ---
Mean Absolute Error (MAE): 14.125096947960028 HP
R-squared (R^2): 0.9058198006025279

Visualize¶

In [10]:
plt.style.use('ggplot')

plt.figure(figsize=(8, 6))
plt.scatter(y_test, predictions, alpha=0.7, edgecolors='w', s=100)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)

plt.xlabel("True HP")
plt.ylabel("Predicted HP")
plt.title("Model Performance: True vs. Predicted HP")
plt.show()
[Figure: scatter plot of true vs. predicted HP with a y = x reference line]

Finding the Best Alpha For Our Model¶

We know that a high alpha value, or strong regularization, creates a simpler model, while a low alpha value, or weak regularization, creates a more complex model. We can take a set of alpha values from 0.001 to 1000 and run the model for each one, then compare the R^2 values across all of them.

In [11]:
alpha_range = np.logspace(-3, 3, 20)

train_scores = []
test_scores = []

for alpha in alpha_range:
  model = Ridge(alpha=alpha)
  model.fit(X_train_scaled, y_train)
  
  y_train_pred = model.predict(X_train_scaled)
  y_test_pred = model.predict(X_test_scaled)
  
  train_scores.append(r2_score(y_train, y_train_pred))
  test_scores.append(r2_score(y_test, y_test_pred))

print("Best Alpha:", alpha_range[test_scores.index(max(test_scores))], "with R^2:", max(test_scores))
Best Alpha: 112.88378916846884 with R^2: 0.9084990186539924

We see that the best alpha value is 112.88, which gives us an R^2 of 0.908. This is higher than the R^2 of our initial pass with an alpha value of 1.0, which was 0.906.

In [12]:
plt.style.use('ggplot')
plt.figure(figsize=(10, 6))

plt.plot(alpha_range, train_scores, 'o-', color='blue', label='Training Score')
plt.plot(alpha_range, test_scores, 'o-', color='green', label='Testing Score (Validation)')

plt.xscale('log')

plt.xlabel('Alpha (Regularization Strength)')
plt.ylabel('R-squared Score')
plt.title('Validation Curve for Ridge Regression')
plt.legend()
plt.grid(True)
plt.show()
[Figure: validation curve of training and testing R-squared scores across the alpha range]

Future Considerations¶

For iterations on this project, I would definitely consider a few things:

  • Make sure all data is scraped accurately. This would mean re-scraping and making sure the attack_name column is actually captured.
  • Consider using Lasso regression as validation and to see which features the Lasso technique would find useless (a rough sketch of this follows below).
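As a starting point for that second item, here is a minimal sketch, not something I ran for this project, that fits Lasso on the same scaled training data and counts how many coefficients it shrinks exactly to zero; the alpha value is an arbitrary placeholder that would need tuning.

from sklearn.linear_model import Lasso

# Fit Lasso on the same scaled features; alpha=0.1 is only a placeholder and would need tuning.
lasso = Lasso(alpha=0.1, max_iter=10000)
lasso.fit(X_train_scaled, y_train)

# Coefficients that Lasso shrinks exactly to zero point at features it considers useless.
lasso_coefficients = pd.Series(lasso.coef_, index=X_processed.columns)
zeroed_features = lasso_coefficients[lasso_coefficients == 0].index.tolist()

print(f"Lasso zeroed out {len(zeroed_features)} of {len(lasso_coefficients)} features")
print(f"Test R^2 with Lasso: {r2_score(y_test, lasso.predict(X_test_scaled)):.3f}")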

Scraping Process (Pains)¶

There are many popular libraries for scraping, but by far the most popular is Playwright. Built on a Node.js core, Playwright drives a real browser and gives the developer easy parsing and navigation tools. Playwright has a Python wrapper, but I did not use it directly; instead, I used Camoufox, which is built on top of Playwright with a custom browser that has many stealth tools built in.

I realized only during development that Camoufox makes it harder to change the browser's context, which makes it harder to create asynchronous workers. After trying and failing a few times, I gave up and stuck with a single worker. This kept development simple, but it also severely limited the speed at which I was able to scrape (a minimal sketch of this single-worker loop is at the end of this section). This wasn't the biggest issue, however:

  • (1) The biggest issue was that the proxy service I had used before, Decodo, required me to go through a verification process that includes an image of my passport and a selfie. I was not comfortable with my information being given to a 3rd party, so I opted out.
  • (2) Since I could not use proxies to scrape the website, I had to rely on my home internet, and out of fear of getting suddenly blacklisted by the website, I set a 5+ second delay on every request.
  • (3) The five-second delay, on top of the scraping process itself and a built-in timeout for features it could not find or recognize, resulted in a total scraping time of around 14 hours.

You can see the starting and ending log entries and their timestamps below:

log
2025-09-22 03:53:52,723 - main - INFO - Starting main scraping process
2025-09-22 03:53:52,726 - main - INFO - Added 4588 URLs to the queue
2025-09-22 03:53:58,924 - main - INFO - Processing URL: https://pkmncards.com/card/bulbasaur-base-set-bs-44/
...
...
...
2025-09-22 17:44:47,009 - main - INFO - Processing URL: https://pkmncards.com/card/mew-ex-paldean-fates-paf-232/
2025-09-22 17:45:17,628 - main - INFO - Successfully scraped and wrote data for: Mew ex
2025-09-22 17:45:17,823 - main - INFO - Scraping process finished.

The extremely long time it took to scrape all this data forced me to keep the dataset even though my bot was not able to scrape the attack names.
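To round out the write-up, here is a minimal sketch of the kind of single-worker loop described above. It uses the plain Playwright sync API rather than Camoufox, and the CSS selector, output path, and columns are illustrative placeholders rather than the ones from my actual scraper.

import csv
import time

from playwright.sync_api import sync_playwright

urls = [
    "https://pkmncards.com/card/bulbasaur-base-set-bs-44/",
    # ... the rest of the card URLs in the queue
]

with sync_playwright() as p:
    browser = p.firefox.launch(headless=True)
    page = browser.new_page()
    with open("./data/pokemon_data_raw.csv", "w", newline="") as f:
        writer = csv.writer(f)
        for url in urls:
            page.goto(url)
            # pkmncards.com shows the card as plain text, so a text selector is enough;
            # ".card-text" is a placeholder for the real selectors parsed field by field.
            card_text = page.inner_text(".card-text")
            writer.writerow([url, card_text])
            time.sleep(5)  # 5+ second delay so my home IP does not get blacklisted
    browser.close()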