Merge Master updates into fork master #10

Merged: 8 commits, Oct 21, 2022
35 changes: 35 additions & 0 deletions doccano_client/cli/active_learning/languages.py
@@ -0,0 +1,35 @@
# fastText embeddings
# https://github.com/flairNLP/flair/blob/cebd2b1c81be4507f62e967f8a2e7701e332dbd3/resources/docs/embeddings/CLASSIC_WORD_EMBEDDINGS.md
LANGUAGES = [
    "en",  # English
    "de",  # German
    "nl",  # Dutch
    "fr",  # French
    "it",  # Italian
    "es",  # Spanish
    "pt",  # Portuguese
    "ro",  # Romanian
    "ca",  # Catalan
    "sv",  # Swedish
    "da",  # Danish
    "no",  # Norwegian
    "fi",  # Finnish
    "pl",  # Polish
    "cz",  # Czech
    "sk",  # Slovak
    "sl",  # Slovenian
    "sr",  # Serbian
    "hr",  # Croatian
    "bg",  # Bulgarian
    "ru",  # Russian
    "ar",  # Arabic
    "he",  # Hebrew
    "tr",  # Turkish
    "fa",  # Persian
    "ja",  # Japanese
    "ko",  # Korean
    "zh",  # Chinese
    "hi",  # Hindi
    "id",  # Indonesian
    "eu",  # Basque
]
72 changes: 72 additions & 0 deletions doccano_client/cli/active_learning/manager.py
@@ -0,0 +1,72 @@
import time
from typing import List, Literal, Optional, Tuple

from flair.trainers import ModelTrainer
from seqal.tagger import SequenceTagger
from tqdm import tqdm

from doccano_client import DoccanoClient

from .preparation import DOCCANO_HOME, prepare_datasets
from .strategies import get_query_strategy
from .trainer import get_tagger_params, get_trainer_params


def execute_one_iteration(
    client: DoccanoClient,
    project_id: int,
    lang: str = "en",
    query_strategy_name: Literal["LC", "MNLP"] = "MNLP",
    transformer_model: Optional[str] = None,
) -> Tuple[List[float], List[int]]:
    print("Maybe downloading dataset...")
    labeled_dataset, unlabeled_dataset = prepare_datasets(client, project_id, lang=lang)

    # Prepare tagger
    tagger_params = get_tagger_params(labeled_dataset, lang=lang, transformer_model=transformer_model)
    tagger = SequenceTagger(**tagger_params)

    # Prepare trainer
    trainer = ModelTrainer(tagger, labeled_dataset)
    trainer_params = get_trainer_params()

    print("Training...")
    model_dir = DOCCANO_HOME / str(project_id) / "models"
    trainer.train(model_dir, **trainer_params)
    print("Training completed.")

    # Query unlabeled dataset
    print("Calculating confidence scores...")
    query_strategy = get_query_strategy(query_strategy_name)
    scores = query_strategy(unlabeled_dataset.sentences, tagger)
    print("Calculation completed.")
    return scores, unlabeled_dataset.ids


def execute_active_learning(
    client: DoccanoClient,
    project_id: int,
    lang: str = "en",
    query_strategy_name: Literal["LC", "MNLP"] = "MNLP",
    transformer_model: Optional[str] = None,
    train_frequency: int = 100,
):
    prev_completed = 0
    while True:
        progress = client.get_progress(project_id)
        if progress.is_finished():
            break
        if progress.completed - prev_completed >= train_frequency:
            prev_completed = progress.completed
            scores, example_ids = execute_one_iteration(
                client,
                project_id=project_id,
                lang=lang,
                query_strategy_name=query_strategy_name,
                transformer_model=transformer_model,
            )
            print("Updating confidence scores...")
            for score, example_id in tqdm(zip(scores, example_ids)):
                client.update_example(project_id, example_id, meta={"confidence": score})
            print("Update completed.")
        time.sleep(10)
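Taken together, execute_active_learning polls project progress every 10 seconds, retrains once train_frequency newly confirmed examples have accumulated, and writes each unlabeled example's confidence score back into its meta field. A usage sketch, assuming a locally running doccano instance; the URL, credentials, and project ID are placeholders:

from doccano_client import DoccanoClient
from doccano_client.cli.active_learning.manager import execute_active_learning

client = DoccanoClient("http://localhost:8000")
client.login(username="admin", password="password")

# Blocks until the project's annotation progress reports finished.
execute_active_learning(client, project_id=1, lang="en", query_strategy_name="MNLP")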
150 changes: 150 additions & 0 deletions doccano_client/cli/active_learning/models.py
@@ -0,0 +1,150 @@
import json
import pathlib
from typing import Dict, Iterable, Iterator, List, Optional, Tuple

from sklearn.model_selection import train_test_split

from doccano_client.models.example import Example
from doccano_client.models.label import Span


class Examples:
    filename = "examples.json"

    def __init__(self, examples: Optional[Iterable[Example]] = None):
        if examples is None:
            examples = []
        self.items = {int(example.id): example for example in examples if example.id}

    def __getitem__(self, example_id: Optional[int]) -> Example:
        if example_id is None:
            raise ValueError("Example ID is None.")
        return self.items[example_id]

    @property
    def ids(self) -> List[int]:
        return list(self.items.keys())

    def save(self, project_dir: pathlib.Path):
        path = project_dir / self.filename
        with path.open("w") as f:
            examples = [example.dict() for example in self.items.values()]
            json.dump(examples, f)

    @classmethod
    def load(cls, project_dir: pathlib.Path):
        path = project_dir / cls.filename
        if not path.exists():
            return cls()
        with path.open() as f:
            items = [Example.parse_obj(example) for example in json.load(f)]
        examples = cls(items)
        return examples

    def confirm(self, example_id: Optional[int]):
        if example_id is None:
            return
        self.items[example_id].is_confirmed = True

    def filter_by(self, is_confirmed: bool) -> "Examples":
        return Examples(example for example in self.items.values() if example.is_confirmed == is_confirmed)

    def filter_by_ids(self, ids: Iterable[int]) -> "Examples":
        return Examples(self.items[example_id] for example_id in ids)


class Spans:
    filename = "spans.json"

    def __init__(self, spans: Optional[Dict[int, List[Span]]] = None):
        self.items = spans or {}

    def __contains__(self, example_id: Optional[int]) -> bool:
        if example_id is None:
            return False
        return example_id in self.items

    def __getitem__(self, example_id: Optional[int]) -> List[Span]:
        if example_id is None:
            raise ValueError("Example ID is None.")
        if example_id not in self.items:
            return []
        return self.items[example_id]

    def add(self, example_id: Optional[int], spans: List[Span]):
        if example_id is None:
            return
        self.items[example_id] = spans

    def save(self, project_dir: pathlib.Path):
        path = project_dir / self.filename
        with path.open("w") as f:
            spans = {example_id: [span.dict() for span in spans] for example_id, spans in self.items.items()}
            json.dump(spans, f)

    @classmethod
    def load(cls, project_dir: pathlib.Path):
        path = project_dir / cls.filename
        if not path.exists():
            return cls()
        with path.open() as f:
            items = json.load(f)
        items = {int(example_id): [Span.parse_obj(span) for span in spans] for example_id, spans in items.items()}
        spans = cls(items)
        return spans

    def filter_by(self, example_ids: List[int]) -> "Spans":
        return Spans({example_id: self.items[example_id] for example_id in example_ids})


class NERDataset:
    def __init__(self, examples: Optional[Examples] = None, spans: Optional[Spans] = None):
        self.examples = examples or Examples()
        self.spans = spans or Spans()

    def __iter__(self) -> Iterator[Tuple[Example, List[Span]]]:
        for example_id in self.examples.ids:
            yield self.examples[example_id], self.spans[example_id]

    def split(self, test_size: float = 0.2, random_state: int = 42) -> Tuple["NERDataset", "NERDataset"]:
        train_ids, test_ids = train_test_split(self.examples.ids, test_size=test_size, random_state=random_state)
        train_examples = self.examples.filter_by_ids(train_ids)
        train_spans = self.spans.filter_by(train_ids)
        test_examples = self.examples.filter_by_ids(test_ids)
        test_spans = self.spans.filter_by(test_ids)
        return NERDataset(train_examples, train_spans), NERDataset(test_examples, test_spans)

    def save(self, project_dir: pathlib.Path):
        self.examples.save(project_dir)
        self.spans.save(project_dir)

    @classmethod
    def load(cls, project_dir: pathlib.Path):
        examples = Examples.load(project_dir)
        spans = Spans.load(project_dir)
        return cls(examples, spans)

    def add_spans(self, example_id: Optional[int], spans: List[Span]):
        if example_id is None:
            return
        self.spans.add(example_id, spans)

    def has_spans(self, example_id: Optional[int]) -> bool:
        if example_id is None:
            return False
        return example_id in self.spans

    def confirm(self, example_id: Optional[int]):
        if example_id is None:
            return
        self.examples.confirm(example_id)

    @property
    def labeled(self) -> "NERDataset":
        examples = self.examples.filter_by(is_confirmed=True)
        spans = self.spans.filter_by(examples.ids)
        return NERDataset(examples, spans)

    @property
    def unlabeled(self) -> "NERDataset":
        return NERDataset(self.examples.filter_by(is_confirmed=False))
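A short sketch of the intended round trip through these containers. The Example and Span keyword arguments below are assumptions inferred from how the fields are used above (the code reads example.id, example.text, example.is_confirmed, and span offsets), and the text, offsets, and label value are made up for illustration; the import path for Examples and NERDataset follows this diff's file location:

import pathlib

from doccano_client.cli.active_learning.models import Examples, NERDataset
from doccano_client.models.example import Example
from doccano_client.models.label import Span

examples = Examples([
    Example(id=1, text="Alice moved to Berlin", is_confirmed=False),  # assumed field names
    Example(id=2, text="Bob stayed home", is_confirmed=False),
])
dataset = NERDataset(examples)
dataset.add_spans(1, [Span(start_offset=0, end_offset=5, label="PER")])  # assumed constructor
dataset.confirm(1)  # only confirmed examples count as labeled

print(dataset.labeled.examples.ids)    # should print [1]
print(dataset.unlabeled.examples.ids)  # should print [2]
dataset.save(pathlib.Path("."))        # writes examples.json and spans.json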
122 changes: 122 additions & 0 deletions doccano_client/cli/active_learning/preparation.py
@@ -0,0 +1,122 @@
import os
import pathlib
from typing import List

import spacy
from flair.data import Sentence, Token
from flair.datasets import ColumnCorpus
from spacy.training import offsets_to_biluo_tags

from doccano_client import DoccanoClient
from doccano_client.models.example import Example
from doccano_client.models.label import Span

from .models import Examples, NERDataset

DOCCANO_HOME = pathlib.Path(os.path.expanduser(os.environ.get("DOCCANO_HOME", "~/doccano")))


class UnlabeledDataset:
    def __init__(self):
        self.items = []

    @property
    def sentences(self):
        return [sentence for _, sentence in self.items]

    @property
    def ids(self):
        return [example_id for example_id, _ in self.items]

    def add(self, example: Example, sentence: Sentence):
        self.items.append((example.id, sentence))


def download_dataset(client: DoccanoClient, project_id: int) -> NERDataset:
    dataset_dir = DOCCANO_HOME / str(project_id) / "dataset"
    if not dataset_dir.exists():
        print(f"Downloading dataset for project {project_id}")
        dataset_dir.mkdir(parents=True, exist_ok=True)
        examples = Examples(client.list_examples(project_id))
        dataset = NERDataset(examples)
        dataset.save(dataset_dir)
    else:
        print(f"Loading dataset for project {project_id}")
        dataset = NERDataset.load(dataset_dir)

    for example in client.list_examples(project_id, is_confirmed=True):
        if not dataset.has_spans(example.id):
            spans = client.list_spans(project_id, example.id)  # type: ignore
            dataset.add_spans(example.id, spans)
        dataset.confirm(example.id)
    dataset.save(dataset_dir)
    return dataset


def make_nlp(lang: str = "en"):
    # flair's embedding list uses "cz" for Czech, but spaCy expects the ISO 639-1 code "cs"
    if lang == "cz":
        lang = "cs"
    nlp = spacy.blank(lang)
    return nlp


def prepare_datasets(client: DoccanoClient, project_id: int, lang: str = "en"):
    # download dataset
    dataset = download_dataset(client, project_id)

    # split train/test dataset
    train_dataset, test_dataset = dataset.labeled.split(test_size=0.5)

    # convert dataset to conll format
    nlp = make_nlp(lang)
    save_dir = DOCCANO_HOME / str(project_id) / "dataset"
    export_examples_to_conll(nlp, train_dataset, save_dir / "train.txt")
    export_examples_to_conll(nlp, test_dataset, save_dir / "test.txt")

    # load datasets for flair
    labeled_dataset = load_labeled_dataset(save_dir)
    unlabeled_dataset = load_unlabeled_dataset(nlp, dataset.unlabeled)
    return labeled_dataset, unlabeled_dataset


def convert_example_to_conll(nlp: spacy.Language, example: Example, spans: List[Span]):
    doc = nlp(example.text)  # type: ignore
    ents = [span.to_tuple() for span in spans]
    tags = offsets_to_biluo_tags(doc, ents)  # type: ignore
    for token, tag in zip(doc, tags):
        # rename BILUO's U (unit) and L (last) tags to the S/E tags of the BIOES scheme
        tag = tag.replace("U-", "S-")
        tag = tag.replace("L-", "E-")
        yield f"{token.text}\t{tag}\n"


def export_examples_to_conll(nlp: spacy.Language, dataset: NERDataset, path: pathlib.Path):
    with path.open("w", encoding="utf-8") as f:
        for example, spans in dataset:
            lines = convert_example_to_conll(nlp, example, spans)
            f.writelines(lines)
            f.write("\n")


def load_labeled_dataset(data_dir: pathlib.Path):
    columns = {0: "text", 1: "ner"}
    corpus = ColumnCorpus(
        data_dir,
        columns,
        train_file="train.txt",
        dev_file="test.txt",
        test_file="test.txt",
    )
    return corpus


def load_unlabeled_dataset(nlp: spacy.Language, dataset: NERDataset):
    unlabeled_dataset = UnlabeledDataset()
    for example, _ in dataset:
        doc = nlp(example.text)  # type: ignore
        sentence = Sentence()
        for word in doc:
            token = Token(text=word.text, start_position=word.idx, whitespace_after=word.whitespace_)
            token.add_tag("ner", "O")
            sentence.add_token(token)
        unlabeled_dataset.add(example, sentence)
    return unlabeled_dataset
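The conversion in convert_example_to_conll leans on spaCy to align character offsets to tokens, then renames the resulting BILUO tags to BIOES (S- for single-token entities, E- for entity ends), which is the scheme the exported CoNLL files use. A standalone sketch of that mapping using only spaCy; the sentence, offsets, and labels are made up:

import spacy
from spacy.training import offsets_to_biluo_tags

nlp = spacy.blank("en")
doc = nlp("Alice moved to Berlin")
tags = offsets_to_biluo_tags(doc, [(0, 5, "PER"), (15, 21, "LOC")])
print(tags)  # ['U-PER', 'O', 'O', 'U-LOC']

# The same renaming the PR applies per tag:
bioes = [t.replace("U-", "S-").replace("L-", "E-") for t in tags]
print(bioes)  # ['S-PER', 'O', 'O', 'S-LOC']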