hbp5181
/

BindPred

@@ -3,92 +3,154 @@ import pandas as pd
 import numpy as np
 from sklearn.model_selection import KFold
 from sklearn.metrics import mean_squared_error, r2_score
-from scipy.stats import pearsonr, ttest_ind
 from catboost import CatBoostRegressor
-# Load dataset, this should be specified for which model will be trained(eg., embedding only or including physical terms)
-data = pd.read_csv("embeddings/ESM2_interaction.csv")
-# Fill missing feature strings (Features are chosen based on what kind of mdoel will be trained.
-# Ligand and Receptor Features are ESM2 embeddings and Physical Features are PyRosetta Features
-for col in ["Ligand Features", "Receptor Features", "Physical Features"]:
-    data[col] = data[col].fillna("")
-# Parse comma-separated floats
-for col in ["Ligand Features", "Receptor Features", "Physical Features"]:
-    data[col] = data[col].apply(
-        lambda s: [float(x) for x in str(s).split(",") if x.strip()]
     )
-# Build feature arrays
-X_ligand = np.vstack(data["Ligand Features"].values)
-X_receptor = np.vstack(data["Receptor Features"].values)
-# optional: X_physical = np.vstack(data["Physical Features"].values)
-# Convert KD(M) into log10 scale
-raw_y = data["KD(M)"].values
-y = np.log10(raw_y)  # assumes all KD values are positive
 records = []
-# Repeat 5×5-fold CV, with and without physical features
-for repeat in range(1, 6):
-    kf = KFold(n_splits=5, shuffle=True, random_state=repeat)
-    for include_phys in (False, True):
-        X_base = np.hstack([X_ligand, X_receptor])
-        X_full = np.hstack([X_base, X_physical])
-        X_data = X_full if include_phys else X_base
-        for fold_idx, (train_idx, test_idx) in enumerate(kf.split(X_data), start=1):
-            X_train, X_test = X_data[train_idx], X_data[test_idx]
-            y_train, y_test = y[train_idx], y[test_idx]
-            # Initialize with your chosen hyperparameters and GPU support
-            model = CatBoostRegressor(
-                iterations=2000,
-                learning_rate=0.08,
-                depth=4,
-                verbose=500,
-                task_type="GPU",
-                devices="0"
-            )
-            # Train and time this fold
-            model.fit(X_train, y_train)
-            preds = model.predict(X_test)
-            rmse = np.sqrt(mean_squared_error(y_test, preds))
-            r2   = r2_score(y_test, preds)
-            pcc  = pearsonr(y_test, preds)[0]
-            records.append({
-                "repeat": repeat,
-                "fold": fold_idx,
-                "with_physical": include_phys,
-                "pearson_r": pcc,
-                "r2": r2,
-                "rmse": rmse
-            })
-# Aggregate metrics
-metrics_df = pd.DataFrame(records)
-# Save to CSV
-out_dir = "metrics"
-os.makedirs(out_dir, exist_ok=True)
-csv_path = os.path.join(out_dir, "InteractionMetrics.csv")
-metrics_df.to_csv(csv_path, index=False)
-print(f"All metrics saved to {csv_path}")
-# Conduct independent t tests for each metric
-results = {}
-for metric in ["pearson_r", "r2", "rmse"]:
-    grp_with = metrics_df.loc[metrics_df.with_physical, metric]
-    grp_without = metrics_df.loc[~metrics_df.with_physical, metric]
-    t_stat, p_val = ttest_ind(grp_with, grp_without, equal_var=False)
-    results[metric] = (t_stat, p_val)
-print("\nT test results comparing with vs without physical features:")
-for m, (t_stat, p_val) in results.items():
-    print(f"{m} → t = {t_stat:.3f}, p = {p_val:.3f}")

 import numpy as np
 from sklearn.model_selection import KFold
 from sklearn.metrics import mean_squared_error, r2_score
+from scipy.stats import pearsonr, spearmanr
 from catboost import CatBoostRegressor
+import matplotlib.pyplot as plt
+# Set publication-style fonts
+# These are global matplotlib defaults; calls further down that pass an
+# explicit fontsize= argument use that value instead of these defaults.
+plt.rcParams.update({
+    'font.family': 'serif',
+    'font.size': 13,
+    'axes.labelsize': 14,
+    'axes.titlesize': 14,
+    'xtick.labelsize': 12,
+    'ytick.labelsize': 12,
+    'legend.fontsize': 12
+})
+# Load the dataset of ligand/receptor embeddings and measured KD values
+data = pd.read_csv("/storage/group/cdm8/default/BindPred/embeddings/Seq_Gen_updated.csv")
+# Parse one embedding cell: a comma-separated string becomes a list of floats.
+# Anything that is not a string (e.g. NaN for a missing cell) becomes an empty
+# list, so the row is dropped by the validity filter below.
+def _parse_embedding(cell):
+    if not isinstance(cell, str):
+        return []
+    return [float(tok) for tok in cell.split(',') if tok.strip()]
+# Convert embedding strings to float lists (same parser for both columns)
+for _col in ('Ligand Features', 'Receptor Features'):
+    data[_col] = data[_col].apply(_parse_embedding)
+# Combine embeddings: ligand vector followed by receptor vector. Rows missing
+# either side get an empty array and are removed by the filter below.
+data['Combined Features'] = data.apply(
+    lambda row: np.concatenate((row['Ligand Features'], row['Receptor Features']))
+    if len(row['Ligand Features']) > 0 and len(row['Receptor Features']) > 0 else np.array([]),
+    axis=1
+)
+# Filter valid rows
+data = data[data['Combined Features'].apply(len) > 0]
+# Fail with a clear message instead of an opaque np.vstack error further down
+if data.empty:
+    raise ValueError("No rows with both 'Ligand Features' and 'Receptor Features' embeddings.")
+# Check KD(M) column
+if "KD(M)" not in data.columns or data["KD(M)"].isnull().any():
+    raise ValueError("Missing or NaN values in 'KD(M)' column.")
+# log10 is only defined for positive values; a zero or negative KD would
+# become a -inf/NaN label instead of raising, so reject it up front
+if (data["KD(M)"] <= 0).any():
+    raise ValueError("Non-positive values in 'KD(M)' column; cannot take log10.")
+# Prepare features and log-transformed labels
+# X: one row per sample; y stays a pandas Series (later code indexes it with .iloc)
+X = np.vstack(data['Combined Features'])
+y = np.log10(data['KD(M)'])
+# Cross-validation: a single shuffled 5-fold split with a fixed seed
+kf = KFold(n_splits=5, shuffle=True, random_state=42)
+# Held-out labels, predictions and row positions, pooled across all folds
+all_y_true = []
+all_y_pred = []
+test_indices_all = []
+# Output directory (created if it does not exist yet)
+output_dir = "new_plt"
+os.makedirs(output_dir, exist_ok=True)
+# NOTE(review): the enumerate counter `fold` is not used inside the loop body
+for fold, (train_index, test_index) in enumerate(kf.split(X)):
+    # The split yields integer positions: X is indexed directly, y via .iloc
+    X_train, X_test = X[train_index], X[test_index]
+    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
+    # A new regressor is constructed for every fold
+    model = CatBoostRegressor(
+        iterations=2000,
+        learning_rate=0.08,
+        depth=4,
+        verbose=500,
+        task_type="GPU",
+        devices='0'
     )
+    model.fit(X_train, y_train)
+    y_pred = model.predict(X_test)
+    # Extend the three lists in the same order so they stay aligned by position
+    all_y_true.extend(y_test)
+    all_y_pred.extend(y_pred)
+    test_indices_all.extend(test_index)
+# Convert predictions to arrays
+all_y_true = np.array(all_y_true)
+all_y_pred = np.array(all_y_pred)
+# Compute performance metrics once over the pooled held-out predictions
+# (not per fold); the p-values returned by pearsonr/spearmanr are discarded
+pcc, _ = pearsonr(all_y_true, all_y_pred)
+# NOTE(review): srcc is computed here but not used in the visible plot or CSV code
+srcc, _ = spearmanr(all_y_true, all_y_pred)
+rmse = np.sqrt(mean_squared_error(all_y_true, all_y_pred))
+r2 = r2_score(all_y_true, all_y_pred)
+# Compute absolute error (per sample; used below to color the scatter points)
+errors = np.abs(all_y_true - all_y_pred)
+# Plotting: experimental vs. predicted log10(Kd), colored by absolute error
+plt.figure(figsize=(5, 5))
+plt.title("ESM2 Embeddings", fontsize=15, pad=10)
+sc = plt.scatter(
+    all_y_true,
+    all_y_pred,
+    s=25,
+    c=errors,
+    cmap='Reds',
+    alpha=0.9,
+    edgecolors='black',
+    linewidth=0.4,
+    marker='^'  # triangle markers
+)
+# Diagonal reference line (y = x) spanning the same range as the axis limits
+plt.plot([-15, -2], [-15, -2], color='black', linestyle='--', linewidth=1)
+# Axis setup: both axes are fixed to [-15, -2], so any point outside that
+# range falls outside the drawn area
+plt.xlabel("Experimental Log10(Kd)", fontsize=14, labelpad=10)
+plt.ylabel("BindPred Prediction of Log10(Kd)", fontsize=14, labelpad=10)
+plt.xlim(-15.0, -2.0)
+plt.ylim(-15.0, -2.0)
+plt.gca().set_aspect('equal', adjustable='box')
+# Metrics box, positioned in axes-fraction coordinates (top-left corner)
+plt.text(0.05, 0.95,
+         f"PCC: {pcc:.3f}\nRMSE: {rmse:.3f}\nR²: {r2:.3f}",
+         transform=plt.gca().transAxes,
+         fontsize=12,
+         verticalalignment='top',
+         horizontalalignment='left',
+         bbox=dict(facecolor='white', edgecolor='gray', boxstyle='round,pad=0.3'))
+# Colorbar for the absolute-error coloring of the scatter points
+cbar = plt.colorbar(sc)
+cbar.set_label("Absolute Error", fontsize=12)
+# Save plot as PNG and PDF into output_dir, then display it
+plt.tight_layout()
+plt.savefig(os.path.join(output_dir, 'esm2_plot.png'), dpi=700)
+plt.savefig(os.path.join(output_dir, 'esm2_plot.pdf'), dpi=700)
+plt.show()
+# Save prediction results to CSV: one record per held-out prediction
 records = []
+# idx is the position in the pooled prediction arrays; test_idx is the
+# positional row in `data` (looked up with .iloc) that the prediction belongs to
+for idx, test_idx in enumerate(test_indices_all):
+    row = data.iloc[test_idx]
+    record = {
+        # .get falls back to "NA" when the column is absent from the row
+        "PDB_ID": row.get("PDB_ID", "NA"),
+        "Mutation": row.get("Mutation", "NA"),
+        "Actual_log10Kd": all_y_true[idx],
+        "Predicted_log10Kd": all_y_pred[idx]
+    }
+    records.append(record)
+df_preds = pd.DataFrame(records)
+csv_path = os.path.join(output_dir, "ESM2_predictions.csv")
+# index=False keeps the DataFrame's row index out of the written file
+df_preds.to_csv(csv_path, index=False)
+print(f"Saved prediction results to {csv_path}")