JKusio · Mar 21, 2024
diff --git a/‎.gitignore
+2-1 b/‎.gitignore
+2-1
diff --git a/‎01-SentenceSimilarity.ipynb
+1-1 b/‎01-SentenceSimilarity.ipynb
+1-1
diff --git a/‎03-QuestionAnswering.ipynb
+10-69 b/‎03-QuestionAnswering.ipynb
+10-69
diff --git a/‎04-FineTunningModel.ipynb
+378 b/‎04-FineTunningModel.ipynb
+378
diff --git a/‎requirements.txt
+3-2 b/‎requirements.txt
+3-2
@@ -1,3 +1,4 @@
 env
 .env
-.DS_STORE
+.DS_STORE
+output
@@ -15,7 +15,7 @@
    "source": [
     "## Installing a sample model\n",
     "\n",
-    "We'll use **sentence-transformers/all-MiniLM-L6-v2**. It's a small and lightweight model that will be good for this particular showcase."
+    "We'll use [sentence-transformers/all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2). It's a small and lightweight model that will be good for this particular showcase."
    ]
   },
   {
 
@@ -0,0 +1,378 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Fine-tunning a model\n",
+    "\n",
+    "In this example notebook we'll fine tune a [deepset/roberta-base-squad2](https://huggingface.co/deepset/roberta-base-squad2) with a [poquad dataset](https://huggingface.co/datasets/clarin-pl/poquad)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Downloading poquad dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from datasets import load_dataset\n",
+    "\n",
+    "poquad = load_dataset(\"clarin-pl/poquad\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Downloading model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from transformers import AutoTokenizer\n",
+    "\n",
+    "model_name = 'deepset/roberta-base-squad2'\n",
+    "\n",
+    "tokenizer = AutoTokenizer.from_pretrained(model_name)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Adding preprocessing function\n",
+    "It's from an hugging-face example. More information can be found here - https://huggingface.co/docs/transformers/tasks/question_answering"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def preprocess_function(examples):\n",
+    "    questions = [q.strip() for q in examples[\"question\"]]\n",
+    "    inputs = tokenizer(\n",
+    "        questions,\n",
+    "        examples[\"context\"],\n",
+    "        max_length=384,\n",
+    "        truncation=\"only_second\",\n",
+    "        return_offsets_mapping=True,\n",
+    "        padding=\"max_length\",\n",
+    "    )\n",
+    "\n",
+    "    offset_mapping = inputs.pop(\"offset_mapping\")\n",
+    "    answers = examples[\"answers\"]\n",
+    "    start_positions = []\n",
+    "    end_positions = []\n",
+    "\n",
+    "    for i, offset in enumerate(offset_mapping):\n",
+    "        answer = answers[i]\n",
+    "        start_char = answer[\"answer_start\"][0]\n",
+    "        end_char = answer[\"answer_start\"][0] + len(answer[\"text\"][0])\n",
+    "        sequence_ids = inputs.sequence_ids(i)\n",
+    "\n",
+    "        # Find the start and end of the context\n",
+    "        idx = 0\n",
+    "        while sequence_ids[idx] != 1:\n",
+    "            idx += 1\n",
+    "        context_start = idx\n",
+    "        while sequence_ids[idx] == 1:\n",
+    "            idx += 1\n",
+    "        context_end = idx - 1\n",
+    "\n",
+    "        # If the answer is not fully inside the context, label it (0, 0)\n",
+    "        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:\n",
+    "            start_positions.append(0)\n",
+    "            end_positions.append(0)\n",
+    "        else:\n",
+    "            # Otherwise it's the start and end token positions\n",
+    "            idx = context_start\n",
+    "            while idx <= context_end and offset[idx][0] <= start_char:\n",
+    "                idx += 1\n",
+    "            start_positions.append(idx - 1)\n",
+    "\n",
+    "            idx = context_end\n",
+    "            while idx >= context_start and offset[idx][1] >= end_char:\n",
+    "                idx -= 1\n",
+    "            end_positions.append(idx + 1)\n",
+    "\n",
+    "    inputs[\"start_positions\"] = start_positions\n",
+    "    inputs[\"end_positions\"] = end_positions\n",
+    "    return inputs"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Tokenizing the dateset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "c39dae923e42422a93f19c38880e79c3",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Map:   0%|          | 0/5764 [00:00<?, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "tokenized_poquad = poquad.map(preprocess_function, batched=True, remove_columns=poquad[\"train\"].column_names)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Fine-tuning model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from transformers import DefaultDataCollator\n",
+    "\n",
+    "data_collator = DefaultDataCollator()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "5a3ff2dafc0a4438ae1df02497a71bbd",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/8661 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'loss': 2.366, 'grad_norm': 25.46889305114746, 'learning_rate': 1.884539891467498e-05, 'epoch': 0.17}\n",
+      "{'loss': 2.019, 'grad_norm': 31.06574249267578, 'learning_rate': 1.769079782934996e-05, 'epoch': 0.35}\n",
+      "{'loss': 1.8536, 'grad_norm': 25.971675872802734, 'learning_rate': 1.653619674402494e-05, 'epoch': 0.52}\n",
+      "{'loss': 1.7774, 'grad_norm': 28.223947525024414, 'learning_rate': 1.538159565869992e-05, 'epoch': 0.69}\n",
+      "{'loss': 1.7428, 'grad_norm': 25.615482330322266, 'learning_rate': 1.42269945733749e-05, 'epoch': 0.87}\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "37460c5225d5443c84baba38778c5250",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/361 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'eval_loss': 1.5653996467590332, 'eval_runtime': 91.9805, 'eval_samples_per_second': 62.665, 'eval_steps_per_second': 3.925, 'epoch': 1.0}\n",
+      "{'loss': 1.6227, 'grad_norm': 44.27663803100586, 'learning_rate': 1.3072393488049879e-05, 'epoch': 1.04}\n",
+      "{'loss': 1.4481, 'grad_norm': 27.834014892578125, 'learning_rate': 1.1917792402724858e-05, 'epoch': 1.21}\n",
+      "{'loss': 1.4219, 'grad_norm': 34.01536560058594, 'learning_rate': 1.076319131739984e-05, 'epoch': 1.39}\n",
+      "{'loss': 1.431, 'grad_norm': 24.65727996826172, 'learning_rate': 9.60859023207482e-06, 'epoch': 1.56}\n",
+      "{'loss': 1.4034, 'grad_norm': 43.74335861206055, 'learning_rate': 8.453989146749799e-06, 'epoch': 1.73}\n",
+      "{'loss': 1.3625, 'grad_norm': 39.83943176269531, 'learning_rate': 7.299388061424778e-06, 'epoch': 1.91}\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "bc208431a52148f0bcaa0d9a405baccc",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/361 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'eval_loss': 1.4093455076217651, 'eval_runtime': 89.9364, 'eval_samples_per_second': 64.09, 'eval_steps_per_second': 4.014, 'epoch': 2.0}\n",
+      "{'loss': 1.282, 'grad_norm': 24.49555015563965, 'learning_rate': 6.144786976099758e-06, 'epoch': 2.08}\n",
+      "{'loss': 1.187, 'grad_norm': 29.855417251586914, 'learning_rate': 4.990185890774737e-06, 'epoch': 2.25}\n",
+      "{'loss': 1.1464, 'grad_norm': 21.781904220581055, 'learning_rate': 3.835584805449718e-06, 'epoch': 2.42}\n",
+      "{'loss': 1.1625, 'grad_norm': 25.76011848449707, 'learning_rate': 2.680983720124697e-06, 'epoch': 2.6}\n",
+      "{'loss': 1.1572, 'grad_norm': 40.99302673339844, 'learning_rate': 1.5263826347996768e-06, 'epoch': 2.77}\n",
+      "{'loss': 1.147, 'grad_norm': 25.29827880859375, 'learning_rate': 3.7178154947465653e-07, 'epoch': 2.94}\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "d68f21bbbe2348f8bc41b520f0f81e62",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/361 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'eval_loss': 1.4216176271438599, 'eval_runtime': 90.2408, 'eval_samples_per_second': 63.874, 'eval_steps_per_second': 4.0, 'epoch': 3.0}\n",
+      "{'train_runtime': 8207.5606, 'train_samples_per_second': 16.882, 'train_steps_per_second': 1.055, 'train_loss': 1.4954243963770333, 'epoch': 3.0}\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "TrainOutput(global_step=8661, training_loss=1.4954243963770333, metrics={'train_runtime': 8207.5606, 'train_samples_per_second': 16.882, 'train_steps_per_second': 1.055, 'train_loss': 1.4954243963770333, 'epoch': 3.0})"
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer\n",
+    "\n",
+    "model = AutoModelForQuestionAnswering.from_pretrained(model_name)\n",
+    "\n",
+    "training_args = TrainingArguments(\n",
+    "    output_dir=\"output/roberta-base-squad2-pl\",\n",
+    "    evaluation_strategy=\"epoch\",\n",
+    "    learning_rate=2e-5,\n",
+    "    per_device_train_batch_size=16,\n",
+    "    per_device_eval_batch_size=16,\n",
+    "    num_train_epochs=3,\n",
+    "    weight_decay=0.01,\n",
+    "    push_to_hub=False,\n",
+    ")\n",
+    "\n",
+    "trainer = Trainer(\n",
+    "    model=model,\n",
+    "    args=training_args,\n",
+    "    train_dataset=tokenized_poquad[\"train\"],\n",
+    "    eval_dataset=tokenized_poquad[\"validation\"],\n",
+    "    tokenizer=tokenizer,\n",
+    "    data_collator=data_collator,\n",
+    ")\n",
+    "\n",
+    "trainer.train()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Testing new model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'score': 0.5960702300071716, 'start': 125, 'end': 145, 'answer': 'promieniotwórczością'}\n"
+     ]
+    }
+   ],
+   "source": [
+    "from transformers import pipeline\n",
+    "\n",
+    "model = AutoModelForQuestionAnswering.from_pretrained(\"output/roberta-base-squad2-pl/checkpoint-8500\")\n",
+    "nlp = pipeline('question-answering', model=model, tokenizer=tokenizer)\n",
+    "\n",
+    "context = 'Maria Skłodowska-Curie była polską i naturalizowaną francuską fizyczką i chemiczką, która prowadziła pionierskie badania nad promieniotwórczością. Była pierwszą kobietą, która zdobyła Nagrodę Nobla, pierwszą osobą i jedyną, która zdobyła Nagrody Nobla w dwóch różnych dziedzinach nauki, i była częścią rodziny Curie, która zdobyła pięć Nagród Nobla.'\n",
+    "question = 'W jakiej dziedzinie Maria Curie prowadziła pionierskie badania?'\n",
+    "\n",
+    "result = nlp(question=question, context=context)\n",
+    "\n",
+    "print(result)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We can see a huge improvement in score. From 0.02 on base model to 0.59 on this fine-tuned model. However the answer it not correct, as it would need to be correctly conjugated to `promieniotwórczości` and not `promieniotwórczością`."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "env",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
@@ -5,9 +5,10 @@ torchaudio
 mlx # used for macos
 jupyterlab
 ipywidgets
-transformers
+transformers==4.38.1
 datasets
 accelerate
 qdrant-client
 elasticsearch
-python-dotenv
+python-dotenv
+evaluate
Original file line number	Diff line number	Diff line change
`@@ -15,7 +15,7 @@`
`15`	`15`	`"source": [`
`16`	`16`	`"## Installing a sample model\n",`
`17`	`17`	`"\n",`
`18`		`- "We'll use sentence-transformers/all-MiniLM-L6-v2. It's a small and lightweight model that will be good for this particular showcase."`
	`18`	`+ "We'll use [sentence-transformers/all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2). It's a small and lightweight model that will be good for this particular showcase."`
`19`	`19`	`]`
`20`	`20`	`},`
`21`	`21`	`{`