# Makine Ogrenmesine Giris

In [None]:
import uproot
import numpy as np
import pandas as pd
import awkward as ak
import tensorflow as tf
import matplotlib.pyplot as plt

## Z ve TTBar olaylarini birbirinden ayiran basit bir model yapacagiz
- Once tree'leri alacagiz.
- Biraz kinematik degiskenlerimize bakacagiz.
- Makine ogrenmesi icin girdi verisetimizi olusturacagiz.
- Keras kullanarak dense layerlardan olusan bir model olusturacagiz.
- Modelin sonuclarini inceleyecegiz
- Hyper parameter tuning uygulayacagiz
- cross validation ile en iyi parametre setini kesfedecegiz.
- surpriz ozellik

In [None]:
# Open the "Delphes" object inside Zll.root; the bare name on the next line
# makes the notebook display it.
z_tree = uproot.open('Zll.root:Delphes')
z_tree

In [None]:
# Open the "Delphes" object inside ttbar.root; the bare name on the next line
# makes the notebook display it.
ttbar_tree = uproot.open('ttbar.root:Delphes')
ttbar_tree

In [None]:
#z_tree.keys()

In [None]:
# Branch names read from both samples and used as the model's input features:
# muon kinematics, jet kinematics and the missing transverse energy.
variables = (
    ["Muon/Muon.PT", "Muon/Muon.Eta", "Muon/Muon.Phi"]
    + ["Jet/Jet.PT", "Jet/Jet.Eta", "Jet/Jet.Phi"]
    + ["MissingET/MissingET.MET"]
)


In [None]:
# Read the Jet.PT and Jet_size branches of the Z sample.
jet_pt = z_tree["Jet.PT"].array()
jet_size = z_tree["Jet_size"].array()
# Boolean mask selecting the entries whose Jet_size equals exactly 2.
jet_size_mask = (jet_size == 2)

In [None]:
# Histogram the jet pT of the entries passing the two-jet mask.
# 30 bins over 0-150 GeV correspond to the 5 GeV bin width quoted on the y axis.
two_jet_pt = jet_pt[jet_size_mask]
plt.hist(
    two_jet_pt,
    bins=30,
    range=(0, 150),
    label=['Leading jet', 'Second leading jet'],
)
plt.xlabel("Jet $p_{T}$ [GeV]")
plt.ylabel("Number of jets / 5 GeV")
plt.xlim([0, 150])
plt.legend()
plt.show()

In [None]:
# Read the Muon.PT and Muon_size branches of the Z sample.
muon_pt = z_tree["Muon.PT"].array()
muon_size = z_tree["Muon_size"].array()
# Boolean mask selecting the entries whose Muon_size equals exactly 2.
muon_size_mask = (muon_size == 2)

In [None]:
# Histogram the muon pT of the entries passing the two-muon mask.
# 30 bins over 0-150 GeV correspond to the 5 GeV bin width quoted on the y axis.
two_muon_pt = muon_pt[muon_size_mask]
plt.hist(
    two_muon_pt,
    bins=30,
    range=(0, 150),
    label=['Leading muon', 'Second leading muon'],
)
plt.xlabel("Muon $p_{T}$ [GeV]")
plt.ylabel("Number of muons / 5 GeV")
plt.xlim([0, 150])
plt.legend()
plt.show()

In [None]:
# Compare the MissingET.MET distributions of the two samples.
z_met = z_tree["MissingET.MET"].array()
tt_met = ttbar_tree["MissingET.MET"].array()

# Both histograms share binning and are normalised to unit area (density=True)
# so that their shapes can be compared directly.
for met_values, sample_label, sample_color in (
    (z_met, "Z → LL", "blue"),
    (tt_met, "tt̄", "red"),
):
    plt.hist(
        met_values,
        bins=30,
        range=(0, 150),
        alpha=0.5,
        label=sample_label,
        color=sample_color,
        density=True,
    )

plt.xlim([0, 150])
plt.xlabel("MET $p_{T}$ [GeV]")
plt.ylabel("Density")
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Read the selected branches of the Z sample as NumPy arrays.
z_data_dict = z_tree.arrays(variables, library="np")
# One DataFrame column per branch, in the order given by `variables`.
z_df = pd.DataFrame({branch: z_data_dict[branch] for branch in variables})
# Class label for this sample: 1.
z_df.insert(len(z_df.columns), "label", 1)

In [None]:
# Read the selected branches of the ttbar sample as NumPy arrays.
tt_data_dict = ttbar_tree.arrays(variables, library="np")
# One DataFrame column per branch, in the order given by `variables`.
tt_df = pd.DataFrame({branch: tt_data_dict[branch] for branch in variables})
# Class label for this sample: 0.
tt_df.insert(len(tt_df.columns), "label", 0)

In [None]:
# Display the first rows of the Z sample frame.
z_df.head()

In [None]:
# Display the first rows of the ttbar sample frame.
tt_df.head()

In [None]:
# Stack the two labelled samples into a single frame with a fresh 0..N-1 index.
df = pd.concat((z_df, tt_df), ignore_index=True)

# Quick look at the first five rows of the combined frame.
print(df.head(5))

In [None]:
#Eksik Değerleri ve Veriyi Temizleme

In [None]:
# Each cell holds a per-event array (e.g. several muons in one event).
# Keep only the first entry; empty or non-array cells become NaN.
for col in variables:
    df[col] = df[col].apply(
        lambda x: x[0] if isinstance(x, np.ndarray) and len(x) > 0 else np.nan
    )

# Drop the events that are missing any of the inputs.
df.dropna(inplace=True)
# dropna leaves gaps in the index. Later cells rebuild X with a fresh
# RangeIndex while y is taken straight from df, so renumber the rows here
# to keep features and labels aligned by index as well as by position.
df.reset_index(drop=True, inplace=True)

# Look at the cleaned data.
print(df.head())


In [None]:
from sklearn.preprocessing import StandardScaler

# Target vector: the class label column.
y = df["label"]
# Feature matrix: every column except the label.
X = df.drop("label", axis=1)


In [None]:
# Standardise every feature column.
# NOTE(review): the scaler is fitted on the full data set, before the
# train/test split made further down — confirm this is acceptable here.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

# Wrap the scaled array in a DataFrame again, keeping the column names.
X = pd.DataFrame(data=X_scaled, columns=X.columns)

print(X.head())

In [None]:
from sklearn.model_selection import train_test_split

# 80 % of the events go to training, 20 % to testing; the seed fixes the split.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

print(f"Eğitim verisi boyutu: {X_train.shape}")
print(f"Test verisi boyutu: {X_test.shape}")


In [None]:
from tensorflow import keras
from tensorflow.keras import layers

# Fully connected binary classifier: 32 -> 16 -> 1 units.
n_inputs = X_train.shape[1]
model = keras.Sequential()
model.add(layers.Dense(32, activation="relu", input_shape=(n_inputs,)))  # first hidden layer
model.add(layers.Dense(16, activation="relu"))  # second hidden layer
model.add(layers.Dense(1, activation="sigmoid"))  # single sigmoid output

# Binary cross-entropy loss with the Adam optimiser; accuracy is tracked.
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Print the layer / parameter summary.
model.summary()


In [None]:
# Train for 20 epochs in mini-batches of 32.
# The test split is passed as validation data, so it is monitored each epoch.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    batch_size=32,
    epochs=20,
)


In [None]:
# Loss and accuracy of the trained model on the test split.
evaluation = model.evaluate(X_test, y_test)
test_loss, test_acc = evaluation
print(f"Test Doğruluğu: {test_acc:.4f}")


In [None]:
import matplotlib.pyplot as plt

# One figure per metric: training curve and validation curve versus epoch.
# Tuple layout: (train key, validation key, train label, validation label,
#                y-axis label, figure title)
curve_specs = (
    ("loss", "val_loss", "Eğitim Kaybı", "Doğrulama Kaybı", "Kayıp", "Kayıp Grafiği"),
    ("accuracy", "val_accuracy", "Eğitim Doğruluğu", "Doğrulama Doğruluğu", "Doğruluk", "Doğruluk Grafiği"),
)
for train_key, val_key, train_label, val_label, axis_label, figure_title in curve_specs:
    plt.plot(history.history[train_key], label=train_label)
    plt.plot(history.history[val_key], label=val_label)
    plt.xlabel("Epoch")
    plt.ylabel(axis_label)
    plt.legend()
    plt.title(figure_title)
    plt.show()


In [None]:
from sklearn.metrics import roc_curve, auc, confusion_matrix, classification_report

# Predicted probabilities on the test split; the sigmoid output has shape
# (n, 1), so flatten it to 1-D before thresholding.
y_pred_prob = model.predict(X_test).flatten()
y_pred = (y_pred_prob > 0.5).astype(int)  # classify with a 0.5 threshold

# --- ROC curve ---
fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(6, 5))
plt.plot(fpr, tpr, color="darkorange", lw=2, label=f"ROC Curve (AUC = {roc_auc:.2f})")
plt.plot([0, 1], [0, 1], color="gray", lw=1, linestyle="--")  # random-classifier diagonal
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Eğrisi")
plt.legend(loc="lower right")
plt.grid(True)
plt.show()

# --- Confusion matrix ---
# The samples were labelled tt̄ = 0 and Z → LL = 1. Pin that order explicitly
# and list the class names in the same order, so row/column 0 is tt̄ and
# row/column 1 is Z → LL (previously the two names were swapped).
class_ids = [0, 1]
classes = ["tt̄", "Z → LL"]
cm = confusion_matrix(y_test, y_pred, labels=class_ids)

plt.figure(figsize=(5, 4))
plt.imshow(cm, interpolation="nearest", cmap="Blues")
plt.title("Confusion Matrix")
plt.colorbar()

# Class names on both axes.
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes)
plt.yticks(tick_marks, classes)

# Write the counts inside the cells (row = true class, column = predicted).
for i, row in enumerate(cm):
    for j, count in enumerate(row):
        plt.text(j, i, str(count), ha="center", va="center", color="black", fontsize=14)

plt.xlabel("Tahmin Edilen")
plt.ylabel("Gerçek")
plt.show()

# --- Classification report ---
print(
    "Classification Report:\n",
    classification_report(y_test, y_pred, labels=class_ids, target_names=classes),
)


In [None]:
#Overfitting’i Önlemek (Dropout & Batch Normalization)

In [None]:
# Second network: same 32 -> 16 -> 1 layout as before, with batch
# normalisation and 30 % dropout after each hidden layer.
model2 = keras.Sequential()
model2.add(layers.Dense(32, activation="relu", input_shape=(X_train.shape[1],)))
model2.add(layers.BatchNormalization())
model2.add(layers.Dropout(0.3))

model2.add(layers.Dense(16, activation="relu"))
model2.add(layers.BatchNormalization())
model2.add(layers.Dropout(0.3))

model2.add(layers.Dense(1, activation="sigmoid"))  # output layer

# Compile with the same loss / optimiser / metric as the first model.
model2.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Print the layer / parameter summary.
model2.summary()

# Train with the same settings as the first model.
history2 = model2.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    batch_size=32,
    epochs=20,
)


In [None]:
#Yeni Feature’lar Türeterek Modeli Güçlendirme

In [None]:
# NOTE(review): the jet columns of df hold a single value per event at this
# point, so "Total_Jet_PT" is an exact copy of "Jet/Jet.PT" — confirm whether
# a true sum over all jets was intended.
df["Total_Jet_PT"] = df["Jet/Jet.PT"]

# Azimuthal separation between the muon and the jet. phi is periodic, so the
# raw absolute difference can reach 2*pi; fold it back into [0, pi] so that
# directions on either side of the +-pi boundary are treated as close.
raw_delta_phi = (df["Muon/Muon.Phi"] - df["Jet/Jet.Phi"]).abs()
df["Muon_Jet_DeltaPhi"] = raw_delta_phi.where(
    raw_delta_phi <= np.pi, 2 * np.pi - raw_delta_phi
)

# Scalar sum of the muon pT, jet pT and missing ET.
df["Scalar_Sum_PT"] = df["Muon/Muon.PT"] + df["Jet/Jet.PT"] + df["MissingET/MissingET.MET"]

# Rebuild the feature matrix with the new columns and standardise it again.
X = df.drop(columns=["label"])
y = df["label"]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X = pd.DataFrame(X_scaled, columns=X.columns)

# Split the extended data set into training and test parts again.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Random forest with 100 trees and a fixed seed, fitted on the training split.
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
rf_model.fit(X_train, y_train)

# Accuracy of the forest's class predictions on the test split.
y_pred_rf = rf_model.predict(X_test)
rf_acc = accuracy_score(y_test, y_pred_rf)

print(f"Random Forest Test Doğruluğu: {rf_acc:.4f}")


In [None]:
# model2 was built and trained before the derived features were added, so its
# input layer may no longer match the current feature matrix. Evaluating it on
# the wider X_test would fail with a shape error; in that case rebuild the
# same architecture for the current width and retrain it first.
if model2.input_shape[-1] != X_train.shape[1]:
    model2 = keras.Sequential([
        layers.Dense(32, activation="relu", input_shape=(X_train.shape[1],)),
        layers.BatchNormalization(),
        layers.Dropout(0.3),

        layers.Dense(16, activation="relu"),
        layers.BatchNormalization(),
        layers.Dropout(0.3),

        layers.Dense(1, activation="sigmoid"),
    ])
    model2.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
    history2 = model2.fit(X_train, y_train, epochs=20, batch_size=32,
                          validation_data=(X_test, y_test))

# Accuracy of the neural network on the current test split, next to the
# random forest result for comparison.
test_loss, test_acc = model2.evaluate(X_test, y_test)

print(f"Neural Network Test Doğruluğu: {test_acc:.4f}")
print(f"Random Forest Test Doğruluğu: {rf_acc:.4f}")


In [None]:
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import RandomizedSearchCV
from tensorflow.keras.optimizers import Adam

# Modeli tanımlayan fonksiyon
def build_model(units1=32, units2=16, dropout_rate=0.2, learning_rate=0.001):
    """Build and compile a two-hidden-layer binary classifier.

    The input width is read from the module-level ``X_train``.

    Args:
        units1: Number of units in the first hidden layer.
        units2: Number of units in the second hidden layer.
        dropout_rate: Dropout fraction applied after each hidden layer.
        learning_rate: Learning rate passed to the Adam optimiser.

    Returns:
        The compiled ``keras.Sequential`` model (binary cross-entropy loss,
        accuracy metric).
    """
    n_inputs = X_train.shape[1]

    net = keras.Sequential()
    net.add(layers.Dense(units1, activation="relu", input_shape=(n_inputs,)))
    net.add(layers.BatchNormalization())
    net.add(layers.Dropout(dropout_rate))

    net.add(layers.Dense(units2, activation="relu"))
    net.add(layers.BatchNormalization())
    net.add(layers.Dropout(dropout_rate))

    # Single sigmoid unit: binary classification output.
    net.add(layers.Dense(1, activation="sigmoid"))

    net.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )
    return net

# Wrap the Keras model builder so scikit-learn's search utilities can use it.
# NOTE(review): tensorflow.keras.wrappers.scikit_learn is not shipped with
# recent TensorFlow releases — confirm the installed version still has it.
model = KerasClassifier(build_fn=build_model, verbose=0)

# Candidate values for each hyperparameter.
param_grid = {
    "units1": [16, 32, 64],  # units in the first hidden layer
    "units2": [8, 16, 32],  # units in the second hidden layer
    "dropout_rate": [0.2, 0.3, 0.4],  # dropout fraction
    "learning_rate": [0.001, 0.01, 0.1],  # Adam learning rate
    "batch_size": [16, 32, 64],  # mini-batch size
    "epochs": [10, 20]  # number of training epochs
}

# Sample 10 random combinations and score each with 3-fold cross-validation.
# random_state=42 matches the seed used for every other stochastic step in
# this notebook, so the sampled combinations are the same on every run.
random_search = RandomizedSearchCV(estimator=model, param_distributions=param_grid,
                                   n_iter=10, cv=3, verbose=1, n_jobs=-1,
                                   random_state=42)

# Run the search.
random_search.fit(X_train, y_train)

# Report the best combination found.
print("En iyi parametreler:", random_search.best_params_)


In [None]:
from sklearn.model_selection import GridSearchCV

# Narrow grid around the best point of the randomized search.
best_random = random_search.best_params_

# The lower neighbours are clamped to a positive minimum: when the randomized
# search picks the smallest candidate (units1=16 or units2=8), plain
# subtraction would give 0 units, which is not a valid layer size. Building
# from a set removes duplicates if the clamp lands on the best value itself.
param_grid = {
    "units1": sorted({max(best_random["units1"] - 16, 8), best_random["units1"]}),
    "units2": sorted({max(best_random["units2"] - 8, 4), best_random["units2"]}),
    # Rounded so that e.g. 0.3 - 0.1 is 0.2 rather than 0.19999999999999998.
    "dropout_rate": sorted({max(round(best_random["dropout_rate"] - 0.1, 2), 0.0),
                            best_random["dropout_rate"]}),
    "learning_rate": [best_random["learning_rate"] / 2, best_random["learning_rate"]],
    "batch_size": [best_random["batch_size"]],
    "epochs": [best_random["epochs"]]
}

# Exhaustive search over the narrowed grid with 3-fold cross-validation.
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Report the best combination.
print("En iyi Grid Search parametreleri:", grid_search.best_params_)


In [None]:
from sklearn.model_selection import cross_val_score

best_params = grid_search.best_params_

# Untrained model with the best hyperparameters; it is not touched by the
# cross-validation below.
best_model = build_model(
    units1=best_params["units1"],
    units2=best_params["units2"],
    dropout_rate=best_params["dropout_rate"],
    learning_rate=best_params["learning_rate"]
)

# scikit-learn compatible wrapper. build_model itself is passed as the
# factory so that every fold trains a freshly initialised network. Returning
# the single best_model instance from a lambda would let the weights learned
# in one fold carry over into the next, contaminating the fold scores.
best_model_sklearn = KerasClassifier(
    build_fn=build_model,
    units1=best_params["units1"],
    units2=best_params["units2"],
    dropout_rate=best_params["dropout_rate"],
    learning_rate=best_params["learning_rate"],
    batch_size=best_params["batch_size"],
    epochs=best_params["epochs"],
    verbose=0,
)

# 5-fold cross-validation on the training split.
cv_scores = cross_val_score(best_model_sklearn, X_train, y_train, cv=5, scoring="accuracy")

# Report per-fold scores and their mean / standard deviation.
print(f"Cross Validation Sonuçları: {cv_scores}")
print(f"Ortalama Doğruluk: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")


In [None]:
# Fit the selected model on the training split, using the epoch count and
# batch size chosen by the grid search.
final_fit_settings = {
    "epochs": grid_search.best_params_["epochs"],
    "batch_size": grid_search.best_params_["batch_size"],
}
best_model.fit(X_train, y_train, verbose=1, **final_fit_settings)

# Loss and accuracy on the test split.
final_metrics = best_model.evaluate(X_test, y_test)
test_loss, test_acc = final_metrics
print(f"Test Doğruluğu: {test_acc:.4f}")


In [None]:
import matplotlib.pyplot as plt

# Split by class. The samples were labelled Z → LL = 1 and tt̄ = 0 when the
# frames were built, so select accordingly (the two were previously swapped,
# which put the wrong legend entry on each histogram).
df_zll = df[df["label"] == 1]  # Z → LL events
df_ttbar = df[df["label"] == 0]  # tt̄ events

# Every column except the label. Derived columns were appended after "label",
# so slicing off the last column would keep "label" and drop a real feature.
features = df.columns.drop("label")

# One normalised, overlaid histogram per feature.
for feature in features:
    plt.figure(figsize=(6, 4))
    plt.hist(df_zll[feature], bins=30, alpha=0.5, label="Z → LL", color="blue", density=True)
    plt.hist(df_ttbar[feature], bins=30, alpha=0.5, label="tt̄", color="red", density=True)
    plt.xlabel(feature)
    plt.ylabel("Yoğunluk")
    plt.title(f"{feature} Dağılımı")
    plt.legend()
    plt.grid(True)
    plt.show()


In [None]:
# Save the model to disk in HDF5 (.h5) format.
# best_model is the plain Keras model, not the scikit-learn wrapper.
best_model.save("particle_classifier.h5")


In [None]:
# Reload the saved model from disk and check it against the test split.
from tensorflow.keras.models import load_model

loaded_model = load_model("particle_classifier.h5")

# Loss and accuracy of the reloaded model on the test split.
reloaded_metrics = loaded_model.evaluate(X_test, y_test)
test_loss, test_acc = reloaded_metrics
print(f"Yüklenen Model Test Doğruluğu: {test_acc:.4f}")


In [None]:
# Model output for every test event, flattened to a 1-D array.
raw_predictions = loaded_model.predict(X_test)
y_pred_probs = raw_predictions.flatten()

# Split the outputs by true class (label 1 = Z → LL, label 0 = tt̄).
is_zll_event = (y_test == 1)
is_ttbar_event = (y_test == 0)
zll_probs = y_pred_probs[is_zll_event]
ttbar_probs = y_pred_probs[is_ttbar_event]


In [None]:
# Overlay the model-output distributions of the two true classes,
# each normalised to unit area.
plt.figure(figsize=(6, 4))
for class_probs, class_label, class_color in (
    (zll_probs, "Z → LL", "blue"),
    (ttbar_probs, "tt̄", "red"),
):
    plt.hist(class_probs, bins=30, alpha=0.5, label=class_label, color=class_color, density=True)
plt.xlabel("Predicted Probability")
plt.ylabel("Density")
plt.title("Class Probability Distributions")
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Gradient-based feature importance: the mean absolute derivative of the
# model output with respect to each input feature, averaged over the test set.
model = loaded_model

# Feature names and problem size.
feature_names = X_test.columns.tolist()
n_events = len(X_test)
n_features = len(feature_names)

# Whole test set as one float32 tensor.
input_data = tf.convert_to_tensor(X_test.values, dtype=tf.float32)

# One batched pass replaces the per-event loop. In inference mode each output
# depends only on its own input row, so the gradient of the summed outputs
# with respect to the inputs holds every event's own gradient in its row.
with tf.GradientTape() as tape:
    tape.watch(input_data)
    # Model output is the probability of label 1 (Z → LL).
    prediction = model(input_data, training=False)[:, 0]

gradients = tf.abs(tape.gradient(prediction, input_data))
numpy_gradients = gradients.numpy()  # shape (n_events, n_features)

# Per-event absolute gradients, kept as a list of 1-D arrays.
all_gradients = list(numpy_gradients)

# Average over events; an empty test set yields zeros rather than NaN.
if n_events:
    mean_gradients = numpy_gradients.mean(axis=0, dtype=np.float64)
else:
    mean_gradients = np.zeros(n_features)

feature_importance = np.array(mean_gradients)

# Horizontal bar chart, first feature at the top.
plt.figure(figsize=(8, 5))
plt.barh(feature_names, feature_importance, color="blue", alpha=0.7)
plt.xlabel("Mean Absolute Gradient")
plt.ylabel("Feature")
plt.title("Feature Importance (Gradient-based)")
plt.gca().invert_yaxis()
plt.grid(True)
plt.show()


## Btag identification yapalim 

In [None]:
# Define relevant branches for jet clustering
jet_branches = ["Jet/Jet.PT", "Jet/Jet.Eta", "Jet/Jet.Phi", "Jet/Jet.Mass" , "Jet/Jet.BTag"]
# Define relevant branches for b-jet tagging

In [None]:
# Read the jet branches of the ttbar sample as NumPy arrays.
jet_data = ttbar_tree.arrays(jet_branches, library="np")

In [None]:
# Convert the jet data to a DataFrame with one column per branch, in the
# order given by jet_branches.
df_jets = pd.DataFrame({var: jet_data[var] for var in jet_branches})

In [None]:
# Header line followed by the first rows of the jet frame.
print("Jet Data:", df_jets.head(), sep="\n")

In [None]:
# Rebuild the jet frame keeping only the first jet of every event; events
# whose entry is empty (or not an array) get NaN instead.
leading_jet_columns = {}
for var in jet_branches:
    leading_jet_columns[var] = [
        x[0] if isinstance(x, np.ndarray) and len(x) > 0 else np.nan
        for x in jet_data[var]
    ]
df_jets = pd.DataFrame(leading_jet_columns)

# Remove the events that have no usable jet.
df_jets.dropna(inplace=True)

# Target label: 1 when the BTag value exceeds 0.5, otherwise 0.
btag_passed = df_jets['Jet/Jet.BTag'] > 0.5
df_jets['is_bjet'] = btag_passed.astype(int)

In [None]:
# Target: the derived b-jet flag. Features: everything except the flag and
# the BTag branch it was computed from.
y = df_jets["is_bjet"]
X = df_jets.drop(["Jet/Jet.BTag", "is_bjet"], axis=1)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_curve, auc

# 80 / 20 train-test split with a fixed seed.
bjet_split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = bjet_split

# Random forest with 100 trees and a fixed seed.
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
rf_model.fit(X_train, y_train)

# Hard class predictions and the probability assigned to class 1.
y_pred = rf_model.predict(X_test)
class_probabilities = rf_model.predict_proba(X_test)
y_pred_prob = class_probabilities[:, 1]

# Accuracy and per-class metrics on the test split.
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")

# ROC curve with the area under it, plus the random-classifier diagonal.
fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
roc_auc = auc(fpr, tpr)
plt.figure(figsize=(6, 4))
plt.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.2f})', color='blue')
plt.plot([0, 1], [0, 1], linestyle='--', color='gray')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve for b-Jet Classification")
plt.legend()
plt.grid(True)
plt.show()


In [None]:
# Extended jet branch list: the original kinematic and BTag branches followed
# by the additional per-jet branches.
jet_branches = (
    ["Jet/Jet.PT", "Jet/Jet.Eta", "Jet/Jet.Phi", "Jet/Jet.Mass"]
    + ["Jet/Jet.BTag", "Jet/Jet.NCharged", "Jet/Jet.NNeutrals"]
    + ["Jet/Jet.NeutralEnergyFraction", "Jet/Jet.ChargedEnergyFraction"]
    + ["Jet/Jet.Beta", "Jet/Jet.BetaStar", "Jet/Jet.MeanSqDeltaR", "Jet/Jet.PTD"]
)