{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Classification" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "from tensorflow.keras.models import Sequential\n", "from tensorflow.keras.layers import Dense, Dropout\n", "from tensorflow.keras.optimizers import SGD\n", "\n", "from freeforestml import Variable, Process, Cut, \\\n", " HepNet, ClassicalCV, EstimatorNormalizer, \\\n", " HistogramFactory, confusion_matrix, atlasify, \\\n", " McStack\n", "from freeforestml import toydata, example_style\n", "example_style()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = toydata.get()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "p_ztt = Process(r\"$Z\\rightarrow\\tau\\tau$\", range=(0, 0))\n", "p_sig = Process(r\"Signal\", range=(1, 1))\n", "s_all = McStack(p_ztt, p_sig)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "hist_factory = HistogramFactory(df, stacks=[s_all], weight=\"weight\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Cut-based" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "First, we set up a cut-based event selection as a benchmark." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "hist_factory(Variable(\"$\\Delta \\eta^{jj}$\",\n", " lambda d: (d.jet_1_eta - d.jet_2_eta).abs()),\n", " bins=20, range=(0, 8))\n", "hist_factory(Variable(\"$m^{jj}$\", \"m_jj\"),\n", " bins=20, range=(0, 1500))\n", "None" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "c_sr = Cut(lambda d: d.m_jj > 400) & \\\n", " Cut(lambda d: d.jet_2_pt >= 30) & \\\n", " Cut(lambda d: d.jet_1_eta * d.jet_2_eta < 0) & \\\n", " Cut(lambda d: (d.jet_2_eta - d.jet_1_eta).abs() > 3)\n", "c_sr.label = \"Signal\"\n", "\n", "c_rest = (~c_sr)\n", "c_rest.label = \"Rest\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "confusion_matrix(df, [p_sig, p_ztt], [c_sr, c_rest],\n", " x_label=\"Signal\", y_label=\"Region\", annot=True, weight=\"weight\")\n", "confusion_matrix(df, [p_sig, p_ztt], [c_sr, c_rest], normalize_rows=True,\n", " x_label=\"Signal\", y_label=\"Region\", annot=True, weight=\"weight\")\n", "None" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Neural Network" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df['dijet_deta'] = (df.jet_1_eta - df.jet_2_eta).abs()\n", "df['dijet_prod_eta'] = (df.jet_1_eta * df.jet_2_eta)\n", "input_var = ['dijet_prod_eta', 'm_jj', 'dijet_deta', 'higgs_pt', 'jet_2_pt', 'jet_1_eta', 'jet_2_eta', 'tau_eta']\n", "\n", "output_var = ['is_sig', 'is_ztt']" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df[\"is_sig\"] = p_sig.selection.idx_array(df)\n", "df[\"is_ztt\"] = p_ztt.selection.idx_array(df)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sample_df = df.sample(frac=1000 / len(df)).compute()\n", "sns.pairplot(sample_df, vars=input_var, hue=\"is_sig\")\n", "None" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def model():\n", " m = Sequential()\n", " m.add(Dense(units=15, 
 { "cell_type": "markdown", "metadata": {}, "source": [ "## Neural Network" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df['dijet_deta'] = (df.jet_1_eta - df.jet_2_eta).abs()\n", "df['dijet_prod_eta'] = (df.jet_1_eta * df.jet_2_eta)\n", "input_var = ['dijet_prod_eta', 'm_jj', 'dijet_deta', 'higgs_pt', 'jet_2_pt', 'jet_1_eta', 'jet_2_eta', 'tau_eta']\n", "\n", "output_var = ['is_sig', 'is_ztt']" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df[\"is_sig\"] = p_sig.selection.idx_array(df)\n", "df[\"is_ztt\"] = p_ztt.selection.idx_array(df)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sample_df = df.sample(frac=1000 / len(df)).compute()\n", "sns.pairplot(sample_df, vars=input_var, hue=\"is_sig\")\n", "None" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def model():\n", " m = Sequential()\n", " m.add(Dense(units=15, activation='relu', input_dim=len(input_var)))\n", " m.add(Dense(units=5, activation='relu'))\n", " m.add(Dense(units=2, activation='softmax'))\n", " \n", " m.compile(loss='categorical_crossentropy',\n", " optimizer=SGD(learning_rate=0.1),\n", " weighted_metrics=['categorical_accuracy'])\n", "\n", " return m\n", "\n", "cv = ClassicalCV(5, frac_var='random')\n", "\n", "net = HepNet(model, cv, EstimatorNormalizer, input_var, output_var)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sig_wf = len(p_sig.selection(df).weight) / p_sig.selection(df).weight.sum()\n", "ztt_wf = len(p_ztt.selection(df).weight) / p_ztt.selection(df).weight.sum()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "net.fit(df.compute(), epochs=150, verbose=0, batch_size=2048,\n", " weight=Variable(\"weight\", lambda d: d.weight * (d.is_sig * sig_wf + d.is_ztt * ztt_wf)))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sns.lineplot(x='epoch', y='loss', data=net.history, label=\"Training\")\n", "sns.lineplot(x='epoch', y='val_loss', data=net.history, label=\"Validation\")\n", "plt.ylabel(\"Loss\")\n", "atlasify(False, \"FreeForestML Example\")\n", "None" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Accuracy" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sns.lineplot(x='epoch', y='categorical_accuracy', data=net.history, label=\"Training\")\n", "sns.lineplot(x='epoch', y='val_categorical_accuracy', data=net.history, label=\"Validation\")\n", "plt.ylabel(\"Accuracy\")\n", "atlasify(False, \"FreeForestML Example\")\n", "None" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sns.lineplot(x='epoch', y='val_categorical_accuracy', data=net.history, hue=\"fold\")\n", "plt.legend(loc=4)\n", "atlasify(False, \"FreeForestML Example\")\n", "None" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "out = net.predict(df.compute(), cv='test')\n", "out['pred_sig'] = out.pred_is_sig >= 0.5" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "c_pred_sig = Process(\"Signal\", lambda d: d.pred_is_sig >= 0.5)\n", "c_pred_ztt = Process(r\"$Z\rightarrow\tau\tau$\", lambda d: d.pred_is_sig < 0.5)\n", "\n", "confusion_matrix(out, [p_sig, p_ztt], [c_pred_sig, c_pred_ztt],\n", " x_label=\"Truth\", y_label=\"Classification\", annot=True, weight=\"weight\")\n", "confusion_matrix(out, [p_sig, p_ztt], [c_pred_sig, c_pred_ztt], normalize_rows=True,\n", " x_label=\"Truth\", y_label=\"Classification\", annot=True, weight=\"weight\")\n", "None" ] },
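 { "cell_type": "markdown", "metadata": {}, "source": [ "Besides the confusion matrix, the shape of the network output itself is instructive. The next cell is a minimal sketch that assumes `out` keeps the truth and weight columns of the input DataFrame (as the confusion matrices above suggest), so the stack `s_all` can be reused for the prediction column `pred_is_sig`." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Histogram of the network's signal score, split by truth process (sketch).\n", "# Assumes HistogramFactory accepts the prediction DataFrame like the input one.\n", "hist_factory_out = HistogramFactory(out, stacks=[s_all], weight=\"weight\")\n", "hist_factory_out(Variable(\"Signal score\", \"pred_is_sig\"), bins=20, range=(0, 1))\n", "None" ] },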
 { "cell_type": "markdown", "metadata": {}, "source": [ "### Export to lwtnn" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "To use the network in lwtnn, we need to export it with the `export()` method. This exports one network per fold. It is the responsibility of the user to implement the cross-validation in the analysis framework." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "net.export(\"lwtnn\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!ls lwtnn*" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The final, manual step is to run lwtnn's converter using the shortcut script `test.sh`." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.14" } }, "nbformat": 4, "nbformat_minor": 2 }