Merge Master updates into fork master #10

Merged: 8 commits, Oct 21, 2022
35 changes: 35 additions & 0 deletions doccano_client/cli/active_learning/languages.py
@@ -0,0 +1,35 @@
# fastText embeddings
# https://github.com/flairNLP/flair/blob/cebd2b1c81be4507f62e967f8a2e7701e332dbd3/resources/docs/embeddings/CLASSIC_WORD_EMBEDDINGS.md
LANGUAGES = [
    "en",  # English
    "de",  # German
    "nl",  # Dutch
    "fr",  # French
    "it",  # Italian
    "es",  # Spanish
    "pt",  # Portuguese
    "ro",  # Romanian
    "ca",  # Catalan
    "sv",  # Swedish
    "da",  # Danish
    "no",  # Norwegian
    "fi",  # Finnish
    "pl",  # Polish
    "cz",  # Czech
    "sk",  # Slovak
    "sl",  # Slovenian
    "sr",  # Serbian
    "hr",  # Croatian
    "bg",  # Bulgarian
    "ru",  # Russian
    "ar",  # Arabic
    "he",  # Hebrew
    "tr",  # Turkish
    "fa",  # Persian
    "ja",  # Japanese
    "ko",  # Korean
    "zh",  # Chinese
    "hi",  # Hindi
    "id",  # Indonesian
    "eu",  # Basque
]
72 changes: 72 additions & 0 deletions doccano_client/cli/active_learning/manager.py
@@ -0,0 +1,72 @@
import time
from typing import List, Literal, Optional, Tuple

from flair.trainers import ModelTrainer
from seqal.tagger import SequenceTagger
from tqdm import tqdm

from doccano_client import DoccanoClient

from .preparation import DOCCANO_HOME, prepare_datasets
from .strategies import get_query_strategy
from .trainer import get_tagger_params, get_trainer_params


def execute_one_iteration(
    client: DoccanoClient,
    project_id: int,
    lang: str = "en",
    query_strategy_name: Literal["LC", "MNLP"] = "MNLP",
    transformer_model: Optional[str] = None,
) -> Tuple[List[float], List[int]]:
    print("Maybe downloading dataset...")
    labeled_dataset, unlabeled_dataset = prepare_datasets(client, project_id, lang=lang)

    # Prepare tagger
    tagger_params = get_tagger_params(labeled_dataset, lang=lang, transformer_model=transformer_model)
    tagger = SequenceTagger(**tagger_params)

    # Prepare trainer
    trainer = ModelTrainer(tagger, labeled_dataset)
    trainer_params = get_trainer_params()

    print("Training...")
    model_dir = DOCCANO_HOME / str(project_id) / "models"
    trainer.train(model_dir, **trainer_params)
    print("Training completed.")

    # Query unlabeled dataset
    print("Calculating confidence scores...")
    query_strategy = get_query_strategy(query_strategy_name)
    scores = query_strategy(unlabeled_dataset.sentences, tagger)
    print("Calculation completed.")
    return scores, unlabeled_dataset.ids


def execute_active_learning(
    client: DoccanoClient,
    project_id: int,
    lang: str = "en",
    query_strategy_name: Literal["LC", "MNLP"] = "MNLP",
    transformer_model: Optional[str] = None,
    train_frequency: int = 100,
):
    prev_completed = 0
    while True:
        progress = client.get_progress(project_id)
        if progress.is_finished():
            break
        if progress.completed - prev_completed >= train_frequency:
            prev_completed = progress.completed
            scores, example_ids = execute_one_iteration(
                client,
                project_id=project_id,
                lang=lang,
                query_strategy_name=query_strategy_name,
                transformer_model=transformer_model,
            )
            print("Updating confidence scores...")
            for score, example_id in tqdm(zip(scores, example_ids)):
                client.update_example(project_id, example_id, meta={"confidence": score})
            print("Update completed.")
        time.sleep(10)
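Taken together, execute_active_learning polls project progress every 10 seconds, retrains once train_frequency newly confirmed examples have accumulated, and writes each unlabeled example's confidence score back into its meta field. A usage sketch, assuming a locally running doccano instance; the URL, credentials, and project ID are placeholders:

from doccano_client import DoccanoClient
from doccano_client.cli.active_learning.manager import execute_active_learning

client = DoccanoClient("http://localhost:8000")
client.login(username="admin", password="password")

# Blocks until the project's annotation progress reports finished.
execute_active_learning(client, project_id=1, lang="en", query_strategy_name="MNLP")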
150 changes: 150 additions & 0 deletions doccano_client/cli/active_learning/models.py
@@ -0,0 +1,150 @@
import json
import pathlib
from typing import Dict, Iterable, Iterator, List, Optional, Tuple

from sklearn.model_selection import train_test_split

from doccano_client.models.example import Example
from doccano_client.models.label import Span


class Examples:
    filename = "examples.json"

    def __init__(self, examples: Optional[Iterable[Example]] = None):
        if examples is None:
            examples = []
        self.items = {int(example.id): example for example in examples if example.id}

    def __getitem__(self, example_id: Optional[int]) -> Example:
        if example_id is None:
            raise ValueError("Example ID is None.")
        return self.items[example_id]

    @property
    def ids(self) -> List[int]:
        return list(self.items.keys())

    def save(self, project_dir: pathlib.Path):
        path = project_dir / self.filename
        with path.open("w") as f:
            examples = [example.dict() for example in self.items.values()]
            json.dump(examples, f)

    @classmethod
    def load(cls, project_dir: pathlib.Path):
        path = project_dir / cls.filename
        if not path.exists():
            return cls()
        with path.open() as f:
            items = [Example.parse_obj(example) for example in json.load(f)]
        examples = cls(items)
        return examples

    def confirm(self, example_id: Optional[int]):
        if example_id is None:
            return
        self.items[example_id].is_confirmed = True

    def filter_by(self, is_confirmed: bool) -> "Examples":
        return Examples(example for example in self.items.values() if example.is_confirmed == is_confirmed)

    def filter_by_ids(self, ids: Iterable[int]) -> "Examples":
        return Examples(self.items[example_id] for example_id in ids)


class Spans:
    filename = "spans.json"

    def __init__(self, spans: Optional[Dict[int, List[Span]]] = None):
        self.items = spans or {}

    def __contains__(self, example_id: Optional[int]) -> bool:
        if example_id is None:
            return False
        return example_id in self.items

    def __getitem__(self, example_id: Optional[int]) -> List[Span]:
        if example_id is None:
            raise ValueError("Example ID is None.")
        if example_id not in self.items:
            return []
        return self.items[example_id]

    def add(self, example_id: Optional[int], spans: List[Span]):
        if example_id is None:
            return
        self.items[example_id] = spans

    def save(self, project_dir: pathlib.Path):
        path = project_dir / self.filename
        with path.open("w") as f:
            spans = {example_id: [span.dict() for span in spans] for example_id, spans in self.items.items()}
            json.dump(spans, f)

    @classmethod
    def load(cls, project_dir: pathlib.Path):
        path = project_dir / cls.filename
        if not path.exists():
            return cls()
        with path.open() as f:
            items = json.load(f)
        items = {int(example_id): [Span.parse_obj(span) for span in spans] for example_id, spans in items.items()}
        spans = cls(items)
        return spans

    def filter_by(self, example_ids: List[int]) -> "Spans":
        return Spans({example_id: self.items[example_id] for example_id in example_ids})


class NERDataset:
    def __init__(self, examples: Optional[Examples] = None, spans: Optional[Spans] = None):
        self.examples = examples or Examples()
        self.spans = spans or Spans()

    def __iter__(self) -> Iterator[Tuple[Example, List[Span]]]:
        for example_id in self.examples.ids:
            yield self.examples[example_id], self.spans[example_id]

    def split(self, test_size: float = 0.2, random_state: int = 42) -> Tuple["NERDataset", "NERDataset"]:
        train_ids, test_ids = train_test_split(self.examples.ids, test_size=test_size, random_state=random_state)
        train_examples = self.examples.filter_by_ids(train_ids)
        train_spans = self.spans.filter_by(train_ids)
        test_examples = self.examples.filter_by_ids(test_ids)
        test_spans = self.spans.filter_by(test_ids)
        return NERDataset(train_examples, train_spans), NERDataset(test_examples, test_spans)

    def save(self, project_dir: pathlib.Path):
        self.examples.save(project_dir)
        self.spans.save(project_dir)

    @classmethod
    def load(cls, project_dir: pathlib.Path):
        examples = Examples.load(project_dir)
        spans = Spans.load(project_dir)
        return cls(examples, spans)

    def add_spans(self, example_id: Optional[int], spans: List[Span]):
        if example_id is None:
            return
        self.spans.add(example_id, spans)

    def has_spans(self, example_id: Optional[int]) -> bool:
        if example_id is None:
            return False
        return example_id in self.spans

    def confirm(self, example_id: Optional[int]):
        if example_id is None:
            return
        self.examples.confirm(example_id)

    @property
    def labeled(self) -> "NERDataset":
        examples = self.examples.filter_by(is_confirmed=True)
        spans = self.spans.filter_by(examples.ids)
        return NERDataset(examples, spans)

    @property
    def unlabeled(self) -> "NERDataset":
        return NERDataset(self.examples.filter_by(is_confirmed=False))
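A short sketch of the intended round trip through these containers. The Example and Span keyword arguments below are assumptions inferred from how the fields are used above (the code reads example.id, example.text, example.is_confirmed, and span offsets), and the text, offsets, and label value are made up for illustration; the import path for Examples and NERDataset follows this diff's file location:

import pathlib

from doccano_client.cli.active_learning.models import Examples, NERDataset
from doccano_client.models.example import Example
from doccano_client.models.label import Span

examples = Examples([
    Example(id=1, text="Alice moved to Berlin", is_confirmed=False),  # assumed field names
    Example(id=2, text="Bob stayed home", is_confirmed=False),
])
dataset = NERDataset(examples)
dataset.add_spans(1, [Span(start_offset=0, end_offset=5, label="PER")])  # assumed constructor
dataset.confirm(1)  # only confirmed examples count as labeled

print(dataset.labeled.examples.ids)    # should print [1]
print(dataset.unlabeled.examples.ids)  # should print [2]
dataset.save(pathlib.Path("."))        # writes examples.json and spans.json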
122 changes: 122 additions & 0 deletions doccano_client/cli/active_learning/preparation.py
@@ -0,0 +1,122 @@
import os
import pathlib
from typing import List

import spacy
from flair.data import Sentence, Token
from flair.datasets import ColumnCorpus
from spacy.training import offsets_to_biluo_tags

from doccano_client import DoccanoClient
from doccano_client.models.example import Example
from doccano_client.models.label import Span

from .models import Examples, NERDataset

DOCCANO_HOME = pathlib.Path(os.path.expanduser(os.environ.get("DOCCANO_HOME", "~/doccano")))


class UnlabeledDataset:
    def __init__(self):
        self.items = []

    @property
    def sentences(self):
        return [sentence for _, sentence in self.items]

    @property
    def ids(self):
        return [example_id for example_id, _ in self.items]

    def add(self, example: Example, sentence: Sentence):
        self.items.append((example.id, sentence))


def download_dataset(client: DoccanoClient, project_id: int) -> NERDataset:
    dataset_dir = DOCCANO_HOME / str(project_id) / "dataset"
    if not dataset_dir.exists():
        print(f"Downloading dataset for project {project_id}")
        dataset_dir.mkdir(parents=True, exist_ok=True)
        examples = Examples(client.list_examples(project_id))
        dataset = NERDataset(examples)
        dataset.save(dataset_dir)
    else:
        print(f"Loading dataset for project {project_id}")
        dataset = NERDataset.load(dataset_dir)

    for example in client.list_examples(project_id, is_confirmed=True):
        if not dataset.has_spans(example.id):
            spans = client.list_spans(project_id, example.id)  # type: ignore
            dataset.add_spans(example.id, spans)
        dataset.confirm(example.id)
    dataset.save(dataset_dir)
    return dataset


def make_nlp(lang: str = "en"):
    # flair's embedding list uses "cz" for Czech, but spaCy expects the ISO 639-1 code "cs"
    if lang == "cz":
        lang = "cs"
    nlp = spacy.blank(lang)
    return nlp


def prepare_datasets(client: DoccanoClient, project_id: int, lang: str = "en"):
    # download dataset
    dataset = download_dataset(client, project_id)

    # split train/test dataset
    train_dataset, test_dataset = dataset.labeled.split(test_size=0.5)

    # convert dataset to conll format
    nlp = make_nlp(lang)
    save_dir = DOCCANO_HOME / str(project_id) / "dataset"
    export_examples_to_conll(nlp, train_dataset, save_dir / "train.txt")
    export_examples_to_conll(nlp, test_dataset, save_dir / "test.txt")

    # load datasets for flair
    labeled_dataset = load_labeled_dataset(save_dir)
    unlabeled_dataset = load_unlabeled_dataset(nlp, dataset.unlabeled)
    return labeled_dataset, unlabeled_dataset


def convert_example_to_conll(nlp: spacy.Language, example: Example, spans: List[Span]):
    doc = nlp(example.text)  # type: ignore
    ents = [span.to_tuple() for span in spans]
    tags = offsets_to_biluo_tags(doc, ents)  # type: ignore
    for token, tag in zip(doc, tags):
        # rename BILUO's U (unit) and L (last) tags to the S/E tags of the BIOES scheme
        tag = tag.replace("U-", "S-")
        tag = tag.replace("L-", "E-")
        yield f"{token.text}\t{tag}\n"


def export_examples_to_conll(nlp: spacy.Language, dataset: NERDataset, path: pathlib.Path):
    with path.open("w", encoding="utf-8") as f:
        for example, spans in dataset:
            lines = convert_example_to_conll(nlp, example, spans)
            f.writelines(lines)
            f.write("\n")


def load_labeled_dataset(data_dir: pathlib.Path):
    columns = {0: "text", 1: "ner"}
    corpus = ColumnCorpus(
        data_dir,
        columns,
        train_file="train.txt",
        dev_file="test.txt",
        test_file="test.txt",
    )
    return corpus


def load_unlabeled_dataset(nlp: spacy.Language, dataset: NERDataset):
    unlabeled_dataset = UnlabeledDataset()
    for example, _ in dataset:
        doc = nlp(example.text)  # type: ignore
        sentence = Sentence()
        for word in doc:
            token = Token(text=word.text, start_position=word.idx, whitespace_after=word.whitespace_)
            token.add_tag("ner", "O")
            sentence.add_token(token)
        unlabeled_dataset.add(example, sentence)
    return unlabeled_dataset
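The conversion in convert_example_to_conll leans on spaCy to align character offsets to tokens, then renames the resulting BILUO tags to BIOES (S- for single-token entities, E- for entity ends), which is the scheme the exported CoNLL files use. A standalone sketch of that mapping using only spaCy; the sentence, offsets, and labels are made up:

import spacy
from spacy.training import offsets_to_biluo_tags

nlp = spacy.blank("en")
doc = nlp("Alice moved to Berlin")
tags = offsets_to_biluo_tags(doc, [(0, 5, "PER"), (15, 21, "LOC")])
print(tags)  # ['U-PER', 'O', 'O', 'U-LOC']

# The same renaming the PR applies per tag:
bioes = [t.replace("U-", "S-").replace("L-", "E-") for t in tags]
print(bioes)  # ['S-PER', 'O', 'O', 'S-LOC']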