Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
S
ShallowSink
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
a272-jones
ShallowSink
Commits
2a334f4b
Commit
2a334f4b
authored
1 month ago
by
a272-jones
Browse files
Options
Downloads
Patches
Plain Diff
Finished? Symbolic Regressor, Fit with out system.
parent
f6c9f832
No related branches found
No related tags found
No related merge requests found
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
Working Models/symbolicRegressor.py
+172
-0
172 additions, 0 deletions
Working Models/symbolicRegressor.py
with
172 additions
and
0 deletions
Working Models/symbolicRegressor.py
0 → 100644
+
172
−
0
View file @
2a334f4b
import
os
import
sys
import
pandas
as
pd
import
numpy
as
np
import
re
from
gplearn.genetic
import
SymbolicRegressor
from
sklearn.model_selection
import
train_test_split
from
sklearn.metrics
import
mean_squared_error
,
r2_score
from
sklearn.preprocessing
import
StandardScaler
,
OneHotEncoder
,
LabelEncoder
from
sklearn.pipeline
import
Pipeline
from
sklearn.compose
import
ColumnTransformer
from
sklearn.impute
import
SimpleImputer
import
seaborn
as
sns
### IMPORTANT: When working on these models, be careful about what is printed, as everything printed is stored as the model's response.
# Load the data
# Directory two levels above this script. Nothing in the visible script reads
# it; kept so the module-level name stays available.
project_root = os.path.dirname(os.path.dirname(__file__))

# Command line: <script> <target_column> <csv_path>
if len(sys.argv) < 3:
    # sys.exit(str) writes the message to stderr and exits with status 1,
    # so nothing extra lands on stdout.
    sys.exit(
        f"Usage: {os.path.basename(sys.argv[0])} <target_column> <csv_path>"
    )

file_path = sys.argv[2]
data = pd.read_csv(file_path)

# The target column is supplied by the caller as the first argument.
target_col = sys.argv[1]
if target_col not in data.columns:
    sys.exit(f"Target column '{target_col}' not found in '{file_path}'")

# Features are every column except the target; the target is kept separately.
X = data.drop(columns=target_col)
y = data[target_col]
# Function to convert time periods to number of days
def convert_time_period(value):
    """Convert a time-period value such as ``"5 months"`` to a number of days.

    Args:
        value: A number, a missing value, or anything whose string form
            contains a whole-number count optionally followed by a unit word.
            Units are matched case-insensitively by substring: day, week,
            month, year.

    Returns:
        ``np.nan`` for missing input or text without any digits; numeric
        input unchanged; otherwise the count multiplied by 1 (day), 7 (week),
        30 (month) or 365 (year). A count with no unit, or with an
        unrecognised unit, is returned as-is.
    """
    if pd.isna(value):
        return np.nan

    try:
        # Handle numeric values
        if isinstance(value, (int, float)):
            return value

        # Convert string to lowercase for consistency
        value = str(value).lower()

        # Extract number and unit. The unit must consist of letters only:
        # with ``\w+`` the unit could swallow trailing digits, so a bare
        # "12" was split into number=1 / unit="2" and came back as 1.
        match = re.search(r'(\d+)\s*([a-z]+)', value)
        if not match:
            # No unit present: fall back to the first run of digits, if any.
            number_match = re.search(r'(\d+)', value)
            if number_match:
                return int(number_match.group(1))
            return np.nan

        number = int(match.group(1))
        unit = match.group(2)

        # Convert to days; checked in this order, first matching keyword wins.
        for keyword, days_per_unit in (
            ('day', 1),
            ('week', 7),
            ('month', 30),
            ('year', 365),
        ):
            if keyword in unit:
                return number * days_per_unit

        # If unit is not recognized, just return the number
        return number
    except Exception as e:
        print(f"Error converting '{value}': {e}")
        return np.nan
# Categorize columns by data type
numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()

# Matches values that look like time periods (e.g., "5 months").
# Compiled once because it is applied to every object column.
time_period_pattern = re.compile(r'\d+\s*(?:day|week|month|year)')

# Process each categorical column appropriately
for col in categorical_cols:
    # Share of missing entries in this column, as a percentage
    # (computed for reference; not read by the rest of the visible script).
    missing_pct = X[col].isna().mean() * 100

    # A column is treated as a time period if it is 'Injury_Prognosis' or if
    # any of its first 20 non-missing values looks like "<count> <unit>".
    sample_values = X[col].dropna().iloc[:20]
    is_time_period = col == 'Injury_Prognosis' or any(
        time_period_pattern.search(str(val)) for val in sample_values
    )

    if is_time_period:
        X[col] = X[col].apply(convert_time_period)

        # Fill missing values with the median after conversion. The result is
        # assigned back: calling fillna(inplace=True) on the X[col] selection
        # is a chained in-place update, which pandas deprecates and which
        # leaves X unchanged under Copy-on-Write.
        median_value = X[col].median()
        X[col] = X[col].fillna(median_value)
    else:
        # Regular categorical variable: give missing entries their own
        # category, then label-encode.
        X[col] = X[col].fillna("MISSING_VALUE")

        le = LabelEncoder()
        X[col] = le.fit_transform(X[col])

        # Class -> integer code mapping for reference. It is overwritten on
        # every iteration, so it holds only the most recently encoded column.
        mapping = dict(zip(le.classes_, le.transform(le.classes_)))

# Drop any columns that are still non-numeric after the processing above
non_numeric_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()
if non_numeric_cols:
    X = X.drop(columns=non_numeric_cols)
# Analyze missing values
missing_values = X.isna().sum()

# Check for missing values in target column
target_missing = y.isna().sum()

# Handle missing feature values with median imputation instead of dropping rows.
# Columns with no observed value at all have no median, and SimpleImputer
# discards them from its output; they are removed up front so the imputed
# array and the column labels always have the same width.
X_observed = X.dropna(axis=1, how='all')
num_imputer = SimpleImputer(strategy='median')
# NOTE(review): the imputer is fitted on all rows before the train/test split,
# so test rows contribute to the medians -- confirm this is acceptable.
X_imputed = pd.DataFrame(
    num_imputer.fit_transform(X_observed),
    columns=X_observed.columns,
    # Keep the original row labels so the boolean mask built from y below
    # lines up row-for-row with the imputed features.
    index=X_observed.index,
)

# Handle missing values in target (if any): rows without a target are dropped
# from both the features and the target.
if target_missing > 0:
    mask = y.notna()
    X_imputed = X_imputed[mask]
    y_clean = y[mask]
else:
    y_clean = y.copy()

# Redefine X with imputed data
X_clean = X_imputed
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_clean,
    y_clean,
    test_size=0.2,
    random_state=42,
)

# Standardize the features. The scaler is fitted on the training rows only
# and the same fitted scaler is then applied to both partitions.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Configure and train the symbolic regressor.
# Building blocks the evolved expressions may combine.
gp_function_set = ('add', 'sub', 'mul', 'div', 'sqrt', 'log', 'sin', 'cos')

symbolic_reg = SymbolicRegressor(
    population_size=2000,
    generations=30,
    tournament_size=20,
    p_crossover=0.7,
    p_subtree_mutation=0.1,
    p_hoist_mutation=0.05,
    p_point_mutation=0.1,
    max_samples=0.8,
    verbose=0,  # keep stdout clean; only the final report is printed
    parsimony_coefficient=0.05,
    random_state=42,
    function_set=gp_function_set,
)
symbolic_reg.fit(X_train_scaled, y_train)

# Predictions for both partitions, used by the evaluation that follows.
y_pred_train = symbolic_reg.predict(X_train_scaled)
y_pred_test = symbolic_reg.predict(X_test_scaled)
# Evaluate the model: RMSE is the square root of the mean squared error,
# R² comes straight from scikit-learn; both are computed per partition.
train_mse = mean_squared_error(y_train, y_pred_train)
test_mse = mean_squared_error(y_test, y_pred_test)
train_rmse = np.sqrt(train_mse)
test_rmse = np.sqrt(test_mse)

train_r2 = r2_score(y_train, y_pred_train)
test_r2 = r2_score(y_test, y_pred_test)

### OUTPUTS
# RMSE values are reported with two decimals, R² scores with four.
for rmse_label, rmse_value in (("Train RMSE", train_rmse), ("Test RMSE", test_rmse)):
    print(f"{rmse_label}: {rmse_value:.2f}")
for r2_label, r2_value in (("Train R² Score", train_r2), ("Test R² Score", test_r2)):
    print(f"{r2_label}: {r2_value:.4f}")

# Display the learned expression
print("\nBest symbolic expression:")
print(symbolic_reg._program)
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment