{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Classification"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from tensorflow.keras.models import Sequential\n",
    "from tensorflow.keras.layers import Dense, Dropout\n",
    "from tensorflow.keras.optimizers import SGD\n",
    "\n",
    "from freeforestml import Variable, Process, Cut, \\\n",
    "                    HepNet, ClassicalCV, EstimatorNormalizer, \\\n",
    "                    HistogramFactory, confusion_matrix, atlasify, \\\n",
    "                    McStack\n",
    "from freeforestml import toydata, example_style\n",
    "example_style()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = toydata.get()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "p_ztt = Process(r\"$Z\\rightarrow\\tau\\tau$\", range=(0, 0))\n",
    "p_sig = Process(r\"Signal\", range=(1, 1))\n",
    "s_all = McStack(p_ztt, p_sig)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "hist_factory = HistogramFactory(df, stacks=[s_all], weight=\"weight\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Cut-based"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "First, we set up a cut-based event selection as a benchmark."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "hist_factory(Variable(\"$\\Delta \\eta^{jj}$\",\n",
    "                      lambda d: (d.jet_1_eta - d.jet_2_eta).abs()),\n",
    "             bins=20, range=(0, 8))\n",
    "hist_factory(Variable(\"$m^{jj}$\", \"m_jj\"),\n",
    "             bins=20, range=(0, 1500))\n",
    "None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "c_sr = Cut(lambda d: d.m_jj  > 400) & \\\n",
    "        Cut(lambda d: d.jet_2_pt >= 30) & \\\n",
    "        Cut(lambda d: d.jet_1_eta * d.jet_2_eta < 0) & \\\n",
    "        Cut(lambda d: (d.jet_2_eta - d.jet_1_eta).abs() > 3)\n",
    "c_sr.label = \"Signal\"\n",
    "\n",
    "c_rest = (~c_sr)\n",
    "c_rest.label = \"Rest\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "confusion_matrix(df, [p_sig, p_ztt], [c_sr, c_rest],\n",
    "                 x_label=\"Signal\", y_label=\"Region\", annot=True, weight=\"weight\")\n",
    "confusion_matrix(df, [p_sig, p_ztt], [c_sr, c_rest], normalize_rows=True,\n",
    "                 x_label=\"Signal\", y_label=\"Region\", annot=True, weight=\"weight\")\n",
    "None"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Neural Network"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['dijet_deta'] = (df.jet_1_eta - df.jet_2_eta).abs()\n",
    "df['dijet_prod_eta'] = (df.jet_1_eta * df.jet_2_eta)\n",
    "input_var = ['dijet_prod_eta', 'm_jj', 'dijet_deta', 'higgs_pt', 'jet_2_pt', 'jet_1_eta', 'jet_2_eta', 'tau_eta']\n",
    "\n",
    "output_var = ['is_sig', 'is_ztt']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df[\"is_sig\"] = p_sig.selection.idx_array(df)\n",
    "df[\"is_ztt\"] = p_ztt.selection.idx_array(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sample_df = df.sample(frac=1000 / len(df)).compute()\n",
    "sns.pairplot(sample_df, vars=input_var, hue=\"is_sig\")\n",
    "None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def model():\n",
    "    m = Sequential()\n",
    "    m.add(Dense(units=15, activation='relu', input_dim=len(input_var)))\n",
    "    m.add(Dense(units=5, activation='relu'))\n",
    "    m.add(Dense(units=2, activation='softmax'))\n",
    "    \n",
    "    m.compile(loss='categorical_crossentropy',\n",
    "              optimizer=SGD(lr=0.1),\n",
    "              weighted_metrics=['categorical_accuracy'])\n",
    "\n",
    "    return m\n",
    "\n",
    "cv = ClassicalCV(5, frac_var='random')\n",
    "\n",
    "net = HepNet(model, cv, EstimatorNormalizer, input_var, output_var)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sig_wf = len(p_sig.selection(df).weight) / p_sig.selection(df).weight.sum()\n",
    "ztt_wf = len(p_ztt.selection(df).weight) / p_ztt.selection(df).weight.sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "net.fit(df.compute(), epochs=150, verbose=0, batch_size=2048,\n",
    "        weight=Variable(\"weight\", lambda d: d.weight * (d.is_sig * sig_wf + d.is_ztt * ztt_wf)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sns.lineplot(x='epoch', y='loss', data=net.history, label=\"Training\")\n",
    "sns.lineplot(x='epoch', y='val_loss', data=net.history, label=\"Validation\")\n",
    "plt.ylabel(\"loss\")\n",
    "atlasify(False, \"FreeForestML Example\")\n",
    "None"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Accuracy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sns.lineplot(x='epoch', y='categorical_accuracy', data=net.history, label=\"Training\")\n",
    "sns.lineplot(x='epoch', y='val_categorical_accuracy', data=net.history, label=\"Validation\")\n",
    "plt.ylabel(\"Accuracy\")\n",
    "atlasify(False, \"FreeForestML Example\")\n",
    "None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sns.lineplot(x='epoch', y='val_categorical_accuracy', data=net.history, hue=\"fold\")\n",
    "plt.legend(loc=4)\n",
    "atlasify(False, \"FreeForestML Example\")\n",
    "None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "out = net.predict(df.compute(), cv='test')\n",
    "out['pred_sig'] = out.pred_is_sig >= 0.5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "c_pred_sig = Process(\"Signal\", lambda d: d.pred_is_sig >= 0.5)\n",
    "c_pred_ztt = Process(r\"$Z\\rightarrow\\tau\\tau$\", lambda d: d.pred_is_sig < 0.5) \n",
    "\n",
    "confusion_matrix(out, [p_sig, p_ztt], [c_pred_sig, c_pred_ztt],\n",
    "                 x_label=\"Truth\", y_label=\"Classification\", annot=True, weight=\"weight\")\n",
    "confusion_matrix(out, [p_sig, p_ztt], [c_pred_sig, c_pred_ztt], normalize_rows=True,\n",
    "                 x_label=\"Truth\", y_label=\"Classification\", annot=True, weight=\"weight\")\n",
    "None"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Export to lwtnn"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "In order to use the network in lwtnn, we need to export the neural network with the `export()` method. This export one network per fold. It is the reposibility of the use to implement the cross validation in the analysis framework."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "net.export(\"lwtnn\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!ls lwtnn*"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The final, manuel step is to run the lwtnn's converter using the shortcut script `test.sh`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}