Skip to content

Commit efdc127

Browse files
authored
Merge pull request #1 from ByteBard58/upgrade_ML
New improvements over the original version
2 parents 3c258ee + 54349cd commit efdc127

File tree

10 files changed

+1211
-915
lines changed

10 files changed

+1211
-915
lines changed

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,4 +20,5 @@ scaler.pkl
2020
instance/
2121
shit/
2222
*.db
23-
.vscode/
23+
.vscode/
24+
*.pkl

app.py

Lines changed: 106 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,34 +1,119 @@
1-
from flask import render_template,request,jsonify,Flask
2-
import numpy as np
1+
import os
32
import joblib
3+
import pandas as pd
4+
import numpy as np
5+
from flask import Flask, render_template, request, jsonify
6+
from fit import main
7+
8+
# --- Configuration ---
9+
# Directory that holds the serialized training artifacts.
MODEL_DIR = "models"
# Fitted pipeline, as written by fit.main().
PIPE_PATH = os.path.join(MODEL_DIR, "pipe.pkl")
# Feature column names saved next to the pipeline.
COLUMNS_PATH = os.path.join(MODEL_DIR, "column_names.pkl")
# Integer class label -> human-readable disposition.
reverse_mapping = dict(enumerate(("FALSE POSITIVE", "CANDIDATE", "CONFIRMED")))
13+
14+
# --- Self-Heal Function ---
15+
def initialize_artifacts():
    """
    Make sure the serialized model artifacts exist, training them if needed.

    Creates MODEL_DIR when it is absent, then checks for PIPE_PATH and
    COLUMNS_PATH. If either file is missing, the training routine
    ``fit.main()`` is run to regenerate them.

    Raises:
        SystemExit: with status 1 when the training routine raises, because
            the application cannot serve predictions without the artifacts.
    """
    # 1. Ensure the model directory exists
    os.makedirs(MODEL_DIR, exist_ok=True)

    # 2. Check for missing files
    missing = [
        path for path in (PIPE_PATH, COLUMNS_PATH) if not os.path.exists(path)
    ]
    if not missing:
        print("Model artifacts found. Loading...")
        return

    print("--- MODEL ARTIFACTS MISSING ---")
    for path in missing:
        print(f"Missing: {path}")

    print("Running training routine (fit.main())... This may take a moment.")
    try:
        # Run the main training function from fit.py
        main()
    except Exception as e:
        print(f"\nFATAL: Error during self-heal training: {e}")
        print("Application cannot start without model artifacts.")
        print("Please fix the training script (fit.py) and restart.")
        # Raise SystemExit directly: the bare exit() helper is injected by the
        # `site` module and is not guaranteed to exist in every interpreter.
        raise SystemExit(1) from e

    print("Training complete. Artifacts generated successfully.")
    print("---------------------------------")
446

5-
model = joblib.load("model.pkl")
6-
scaler = joblib.load("scaler.pkl")
47+
# --- Application Startup ---
748

8-
reverse_mapping = {0:"FALSE POSITIVE",1:"CANDIDATE",2:"CONFIRMED"}
49+
# Run the self-heal check *before* loading models
initialize_artifacts()

# Load models
# NOTE: joblib.load unpickles arbitrary objects, so the artifact files must
# come from a trusted source (here they are produced locally by fit.main()).
try:
    pipe = joblib.load(PIPE_PATH)
    column_names = joblib.load(COLUMNS_PATH)
except Exception as e:
    print(f"\nFATAL: Error loading model artifacts: {e}")
    print("Files might be corrupt. Try deleting the 'models' directory and restarting.")
    # Raise SystemExit directly: the bare exit() helper is injected by the
    # `site` module and is not guaranteed to exist in every interpreter.
    raise SystemExit(1) from e
else:
    print("Models loaded successfully.")

# Initialize Flask App
app = Flask(__name__)
1164

1265
@app.route("/")
def home():
    """Serve the landing page rendered from the index.html template."""
    return render_template("index.html")
2868

2969
@app.route("/about")
def about():
    """Serve the about page rendered from the about.html template."""
    return render_template("about.html")
72+
73+
@app.route("/predict", methods=["POST"])
def predict():
    """
    Classify one observation posted as a JSON object.

    The request body must be a JSON object containing every key listed in
    ``feature_keys``. The values are passed, in that order, to the fitted
    pipeline under the column names loaded at startup.

    Returns:
        A JSON response ``{"prediction": <label>, "probabilities": {...}}``
        on success, or ``({"error": <message>}, 400)`` when the body is not a
        JSON object, a feature key is missing, or prediction raises.
    """
    # Order matters: values are matched positionally against column_names.
    feature_keys = (
        "orbital-period",
        "transit-epoch",
        "transit-depth",
        "planet-radius",
        "semi-major-axis",
        "inclination",
        "equilibrium-temp",
        "insolation-flux",
        "impact-parameter",
        "radius-ratio",
        "stellar-density",
        "star-distance",
        "num-transits",
    )

    try:
        # Parse the body once; silent=True yields None instead of raising on
        # a wrong content type or malformed JSON.
        payload = request.get_json(silent=True)
        if not isinstance(payload, dict):
            print("Prediction Error: request body is not a JSON object")
            return jsonify({"error": "Request body must be a JSON object."}), 400

        # Extract features from the JSON request
        raw_features = [payload[key] for key in feature_keys]

        # Create DataFrame with correct column names
        df = pd.DataFrame([raw_features], columns=column_names)

        # Get prediction and probabilities
        pred = int(pipe.predict(df)[0])
        proba = pipe.predict_proba(df)[0]

        # Format probabilities for the response; cast to a builtin float so
        # NumPy scalar types never reach the JSON encoder.
        proba_dict = {
            reverse_mapping[i]: round(float(p), 3) for i, p in enumerate(proba)
        }

        # Send response
        return jsonify(
            {"prediction": reverse_mapping[pred], "probabilities": proba_dict}
        )

    except KeyError as e:
        print(f"Prediction Error: Missing key in request {e}")
        return jsonify({"error": f"Missing feature in request: {e}"}), 400
    except Exception as e:
        print(f"Prediction Error: {e}")
        return jsonify({"error": str(e)}), 400
116+
32117

33118
# Start the development server only when this file is executed directly.
if __name__ == "__main__":
    # NOTE(review): debug=True turns on Flask's debugger and reloader; this is
    # intended for local development -- confirm it is not used in deployment.
    app.run(debug=True)

fit.py

Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,159 @@
1+
import time
2+
import pandas as pd
3+
import numpy as np
4+
import joblib
5+
6+
from sklearn.model_selection import train_test_split
7+
from sklearn.preprocessing import StandardScaler
8+
from sklearn.impute import SimpleImputer
9+
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
10+
from sklearn.linear_model import LogisticRegression
11+
from xgboost import XGBClassifier
12+
from imblearn.over_sampling import SMOTE
13+
from imblearn.pipeline import Pipeline
14+
from sklearn.metrics import classification_report
15+
16+
17+
def get_window(camps, campaign_dates):
    """
    Return the overall observation window spanned by a set of campaigns.

    Args:
        camps: Campaign identifiers. May be a comma-separated string
            (``"5,18"``), a single number (``5`` or ``5.0``), an iterable of
            strings/numbers, or a missing value (``None`` / NaN).
        campaign_dates: Mapping of integer campaign number to a
            ``(start, end)`` pair.

    Returns:
        ``(earliest_start, latest_end)`` over all campaigns that are present
        in ``campaign_dates``. ``(nan, nan)`` when ``camps`` is missing,
        empty, or names no known campaign. Unparseable entries are skipped.
    """
    # Normalize the input into a flat list of candidate tokens.
    if isinstance(camps, str):
        tokens = camps.split(",")
    else:
        try:
            tokens = list(camps)
        except TypeError:
            # Scalars (int, float, NaN, None) are not iterable; treat them as
            # a single candidate so campaign 0 is not mistaken for "missing".
            tokens = [camps]

    starts, ends = [], []
    for token in tokens:
        try:
            if isinstance(token, str):
                camp_num = int(token.strip())
            else:
                camp_num = int(token)
                # Reject non-integral numbers such as 5.7 instead of
                # silently truncating them to campaign 5.
                if camp_num != token:
                    continue
        except (TypeError, ValueError, OverflowError):
            # None, NaN, infinities and non-numeric text are skipped.
            continue

        if camp_num in campaign_dates:
            start, end = campaign_dates[camp_num]
            starts.append(start)
            ends.append(end)

    return (min(starts) if starts else np.nan, max(ends) if ends else np.nan)
35+
36+
37+
def load_and_prepare_data():
    """
    Load the Kepler and K2 CSV files and merge them into one training set.

    The K2 table is given a derived ``num_transits`` column (the number of
    transits falling inside the observation window of its campaigns), reduced
    to the columns that have a Kepler counterpart, and renamed to the Kepler
    (``koi_*``) column names before both tables are concatenated.

    Returns:
        A tuple ``(X, y, columns)`` where ``X`` is the feature matrix as a
        NumPy array, ``y`` is the integer-encoded disposition
        (0 = FALSE POSITIVE / REFUTED, 1 = CANDIDATE, 2 = CONFIRMED) and
        ``columns`` holds the feature column names in the order used by ``X``.
        Rows whose disposition is missing or not one of the known labels are
        dropped, since they cannot be used as training targets.
    """
    label_map = {
        "FALSE POSITIVE": 0, "CANDIDATE": 1, "CONFIRMED": 2, "REFUTED": 0
    }

    # Load Kepler dataset
    df_raw = pd.read_csv("data/kepler_data.csv", comment="#")
    feature_list = [
        "koi_disposition", "koi_period", "koi_time0bk", "koi_depth", "koi_prad",
        "koi_sma", "koi_incl", "koi_teq", "koi_insol", "koi_impact",
        "koi_ror", "koi_srho", "koi_dor", "koi_num_transits"
    ]
    df_1 = df_raw[feature_list].copy()

    # Load K2 dataset
    df_2 = pd.read_csv("data/k2_data.csv", comment="#")

    # Define campaign windows: campaign number -> (start, end)
    campaign_dates = {
        0: (2456725.0, 2456805.0), 1: (2456808.0, 2456891.0), 2: (2456893.0, 2456975.0),
        3: (2456976.0, 2457064.0), 4: (2457065.0, 2457159.0), 5: (2457159.0, 2457246.0),
        6: (2457250.0, 2457338.0), 7: (2457339.0, 2457420.0), 8: (2457421.0, 2457530.0),
        9: (2457504.0, 2457579.0), 10: (2457577.0, 2457653.0), 11: (2457657.0, 2457732.0),
        12: (2457731.0, 2457819.0), 13: (2457820.0, 2457900.0), 14: (2457898.0, 2457942.0),
        15: (2457941.0, 2458022.0), 16: (2458020.0, 2458074.0), 17: (2458074.0, 2458176.0),
        18: (2458151.0, 2458201.0), 19: (2458232.0, 2458348.0)
    }

    # Observation window per row, kept in local Series because the
    # intermediate values are not part of the final feature set.
    windows = [get_window(camps, campaign_dates) for camps in df_2["k2_campaigns"]]
    obs_start = pd.Series([w[0] for w in windows], index=df_2.index, dtype=float)
    obs_end = pd.Series([w[1] for w in windows], index=df_2.index, dtype=float)

    # Transit counting: index of the first and last transit inside the window
    n_min = np.ceil((obs_start - df_2["pl_tranmid"]) / df_2["pl_orbper"])
    n_max = np.floor((obs_end - df_2["pl_tranmid"]) / df_2["pl_orbper"])
    df_2["num_transits"] = (n_max - n_min + 1).clip(lower=0)

    # Select and rename columns so they line up with the Kepler table.
    # NOTE(review): pl_dens is mapped onto koi_srho here; confirm that both
    # columns describe the same physical quantity.
    mapping = {
        "disposition": "koi_disposition", "pl_orbper": "koi_period", "pl_tranmid": "koi_time0bk",
        "pl_trandep": "koi_depth", "pl_rade": "koi_prad", "pl_orbsmax": "koi_sma",
        "pl_orbincl": "koi_incl", "pl_eqt": "koi_teq", "pl_insol": "koi_insol",
        "pl_imppar": "koi_impact", "pl_ratror": "koi_ror", "pl_dens": "koi_srho",
        "pl_ratdor": "koi_dor", "num_transits": "koi_num_transits"
    }
    df_2 = df_2[list(mapping)].rename(columns=mapping)

    # Combine both datasets; ignore_index avoids duplicate row labels.
    df = pd.concat([df_1, df_2], ignore_index=True)

    # Encode the target and discard rows without a usable label; otherwise
    # NaN labels would turn y into floats and break stratified splitting.
    labels = df["koi_disposition"].map(label_map)
    has_label = labels.notna()
    df = df[has_label]

    # Prepare input/output (first column is the target, the rest are features)
    X = df.iloc[:, 1:].to_numpy()
    y = labels[has_label].astype(np.int64).to_numpy()

    return X, y, df.columns[1:]
98+
99+
100+
def build_pipeline():
    """
    Build the (unfitted) training pipeline.

    Steps, in order: mean imputation, standard scaling, SMOTE oversampling,
    and a stacking classifier that combines a random forest and an XGBoost
    model through a logistic-regression final estimator.

    Returns:
        An ``imblearn.pipeline.Pipeline`` ready to be fitted.
    """
    rf = RandomForestClassifier(
        n_estimators=1000, max_depth=None, random_state=542, class_weight="balanced"
    )
    xgb = XGBClassifier(
        n_estimators=1000, max_depth=None, learning_rate=0.5, random_state=9
    )
    estimators = [("rf", rf), ("xgb", xgb)]

    final_estimator = LogisticRegression(
        random_state=891, class_weight="balanced", C=0.1,
        penalty="l2", solver="saga", max_iter=5000
    )

    # passthrough=True feeds the original features to the final estimator in
    # addition to the base estimators' predictions.
    mv = StackingClassifier(
        estimators=estimators, final_estimator=final_estimator,
        cv=5, passthrough=True, n_jobs=-1
    )

    pipe = Pipeline([
        ("impute", SimpleImputer(strategy="mean")),
        ("scale", StandardScaler()),
        # Seeded like every other stochastic component so that repeated
        # training runs generate the same synthetic samples.
        ("smote", SMOTE(random_state=42)),
        ("model", mv)
    ])
    return pipe
126+
127+
def eval(y_test, x_test, estimator):
    """
    Return the classification report of `estimator` on a held-out set.

    Args:
        y_test: True labels.
        x_test: Features passed to ``estimator.predict``.
        estimator: Fitted object exposing ``predict``.

    Returns:
        The value produced by ``classification_report(y_test, predictions)``.

    NOTE: this name shadows the builtin ``eval`` inside this module; it is
    kept because main() calls it by this name.
    """
    predictions = estimator.predict(x_test)
    return classification_report(y_test, predictions)
131+
132+
def main():
    """
    Train the pipeline, print an evaluation report, and save the artifacts.

    Loads the combined dataset, holds out one third of it (stratified) for
    evaluation, fits the pipeline on the rest, prints the elapsed time and the
    classification report, then writes ``pipe.pkl`` and ``column_names.pkl``
    into the ``models`` directory.
    """
    import os  # local import: only needed to prepare the output directory

    # Create the output directory up front so a missing folder fails fast
    # (or is fixed) before the long training run rather than after it.
    output_dir = "models"
    os.makedirs(output_dir, exist_ok=True)

    X, y, column_name = load_and_prepare_data()

    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=1/3, shuffle=True, random_state=91, stratify=y
    )

    pipe_mv = build_pipeline()

    print("Starting model training. It will take some time, sit tight......")
    t1 = time.time()
    pipe_mv.fit(x_train, y_train)
    t2 = time.time()

    print("Model trained successfully")
    minutes, seconds = divmod(t2 - t1, 60)
    print(f"Time Elapsed: {minutes:.0f} M {seconds:.2f} S")

    print(eval(y_test, x_test, pipe_mv))

    joblib.dump(pipe_mv, os.path.join(output_dir, "pipe.pkl"))
    joblib.dump(column_name, os.path.join(output_dir, "column_names.pkl"))
    print("Model and column names saved successfully.")
156+
157+
158+
# Run the full training routine only when this file is executed as a script.
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)