diff --git a/doccano_client/cli/active_learning/__init__.py b/doccano_client/cli/active_learning/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/doccano_client/cli/active_learning/languages.py b/doccano_client/cli/active_learning/languages.py
new file mode 100644
index 0000000..bd5937d
--- /dev/null
+++ b/doccano_client/cli/active_learning/languages.py
@@ -0,0 +1,35 @@
+# fastText embeddings
+# https://github.com/flairNLP/flair/blob/cebd2b1c81be4507f62e967f8a2e7701e332dbd3/resources/docs/embeddings/CLASSIC_WORD_EMBEDDINGS.md
+LANGUAGES = [
+    "en",  # English
+    "de",  # German
+    "nl",  # Dutch
+    "fr",  # French
+    "it",  # Italian
+    "es",  # Spanish
+    "pt",  # Portuguese
+    "ro",  # Romanian
+    "ca",  # Catalan
+    "sv",  # Swedish
+    "da",  # Danish
+    "no",  # Norwegian
+    "fi",  # Finnish
+    "pl",  # Polish
+    "cz",  # Czech
+    "sk",  # Slovak
+    "sl",  # Slovenian
+    "sr",  # Serbian
+    "hr",  # Croatian
+    "bg",  # Bulgarian
+    "ru",  # Russian
+    "ar",  # Arabic
+    "he",  # Hebrew
+    "tr",  # Turkish
+    "fa",  # Persian
+    "ja",  # Japanese
+    "ko",  # Korean
+    "zh",  # Chinese
+    "hi",  # Hindi
+    "id",  # Indonesian
+    "eu",  # Basque
+]
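These codes are flair's classic word-embedding identifiers, which resolve to fastText vectors (note flair uses "cz" for Czech, which `make_nlp` in `preparation.py` maps back to spaCy's "cs"). A minimal sketch of how a code from this list is consumed (the choice of "de" is illustrative; flair downloads the vectors on first use):

```python
# Minimal sketch: a code from LANGUAGES selects flair's classic (fastText)
# word embeddings. The language choice below is illustrative.
from flair.embeddings import WordEmbeddings

from doccano_client.cli.active_learning.languages import LANGUAGES

lang = "de"
assert lang in LANGUAGES
embeddings = WordEmbeddings(lang)  # fetches the German fastText vectors on first use
```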
diff --git a/doccano_client/cli/active_learning/manager.py b/doccano_client/cli/active_learning/manager.py
new file mode 100644
index 0000000..cbd494e
--- /dev/null
+++ b/doccano_client/cli/active_learning/manager.py
@@ -0,0 +1,72 @@
+import time
+from typing import List, Literal, Optional, Tuple
+
+from flair.trainers import ModelTrainer
+from seqal.tagger import SequenceTagger
+from tqdm import tqdm
+
+from doccano_client import DoccanoClient
+
+from .preparation import DOCCANO_HOME, prepare_datasets
+from .strategies import get_query_strategy
+from .trainer import get_tagger_params, get_trainer_params
+
+
+def execute_one_iteration(
+    client: DoccanoClient,
+    project_id: int,
+    lang: str = "en",
+    query_strategy_name: Literal["LC", "MNLP"] = "MNLP",
+    transformer_model: Optional[str] = None,
+) -> Tuple[List[float], List[int]]:
+    print("Preparing datasets (downloading if needed)...")
+    labeled_dataset, unlabeled_dataset = prepare_datasets(client, project_id, lang=lang)
+
+    # Prepare tagger
+    tagger_params = get_tagger_params(labeled_dataset, lang=lang, transformer_model=transformer_model)
+    tagger = SequenceTagger(**tagger_params)
+
+    # Prepare trainer
+    trainer = ModelTrainer(tagger, labeled_dataset)
+    trainer_params = get_trainer_params()
+
+    print("Training...")
+    model_dir = DOCCANO_HOME / str(project_id) / "models"
+    trainer.train(model_dir, **trainer_params)
+    print("Training completed.")
+
+    # Query unlabeled dataset
+    print("Calculating confidence scores...")
+    query_strategy = get_query_strategy(query_strategy_name)
+    scores = query_strategy(unlabeled_dataset.sentences, tagger)
+    print("Calculation completed.")
+    return scores, unlabeled_dataset.ids
+
+
+def execute_active_learning(
+    client: DoccanoClient,
+    project_id: int,
+    lang: str = "en",
+    query_strategy_name: Literal["LC", "MNLP"] = "MNLP",
+    transformer_model: Optional[str] = None,
+    train_frequency: int = 100,
+):
+    prev_completed = 0
+    while True:
+        progress = client.get_progress(project_id)
+        if progress.is_finished():
+            break
+        if progress.completed - prev_completed >= train_frequency:
+            prev_completed = progress.completed
+            scores, example_ids = execute_one_iteration(
+                client,
+                project_id=project_id,
+                lang=lang,
+                query_strategy_name=query_strategy_name,
+                transformer_model=transformer_model,
+            )
+            print("Updating confidence scores...")
+            for score, example_id in tqdm(zip(scores, example_ids)):
+                client.update_example(project_id, example_id, meta={"confidence": score})
+            print("Update completed.")
+        time.sleep(10)
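For reference, a minimal driver for the loop above that skips the CLI; the host, credentials, and project id are placeholders:

```python
# Sketch: run the active-learning loop programmatically.
# The host, credentials, and project id are placeholders.
from doccano_client import DoccanoClient
from doccano_client.cli.active_learning.manager import execute_active_learning

client = DoccanoClient("http://127.0.0.1:8000")
client.login(username="admin", password="password")

# Retrains after every `train_frequency` newly confirmed examples and writes
# a confidence score back into each unlabeled example's metadata.
execute_active_learning(
    client,
    project_id=1,
    lang="en",
    query_strategy_name="MNLP",
    train_frequency=100,
)
```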
diff --git a/doccano_client/cli/active_learning/models.py b/doccano_client/cli/active_learning/models.py
new file mode 100644
index 0000000..941d12b
--- /dev/null
+++ b/doccano_client/cli/active_learning/models.py
@@ -0,0 +1,150 @@
+import json
+import pathlib
+from typing import Dict, Iterable, Iterator, List, Optional, Tuple
+
+from sklearn.model_selection import train_test_split
+
+from doccano_client.models.example import Example
+from doccano_client.models.label import Span
+
+
+class Examples:
+    filename = "examples.json"
+
+    def __init__(self, examples: Optional[Iterable[Example]] = None):
+        if examples is None:
+            examples = []
+        self.items = {int(example.id): example for example in examples if example.id}
+
+    def __getitem__(self, example_id: Optional[int]) -> Example:
+        if example_id is None:
+            raise ValueError("Example ID is None.")
+        return self.items[example_id]
+
+    @property
+    def ids(self) -> List[int]:
+        return list(self.items.keys())
+
+    def save(self, project_dir: pathlib.Path):
+        path = project_dir / self.filename
+        with path.open("w") as f:
+            examples = [example.dict() for example in self.items.values()]
+            json.dump(examples, f)
+
+    @classmethod
+    def load(cls, project_dir: pathlib.Path):
+        path = project_dir / cls.filename
+        if not path.exists():
+            return cls()
+        with path.open() as f:
+            items = [Example.parse_obj(example) for example in json.load(f)]
+        examples = cls(items)
+        return examples
+
+    def confirm(self, example_id: Optional[int]):
+        if example_id is None:
+            return
+        self.items[example_id].is_confirmed = True
+
+    def filter_by(self, is_confirmed: bool) -> "Examples":
+        return Examples(example for example in self.items.values() if example.is_confirmed == is_confirmed)
+
+    def filter_by_ids(self, ids: Iterable[int]) -> "Examples":
+        return Examples(self.items[example_id] for example_id in ids)
+
+
+class Spans:
+    filename = "spans.json"
+
+    def __init__(self, spans: Optional[Dict[int, List[Span]]] = None):
+        self.items = spans or {}
+
+    def __contains__(self, example_id: Optional[int]) -> bool:
+        if example_id is None:
+            return False
+        return example_id in self.items
+
+    def __getitem__(self, example_id: Optional[int]) -> List[Span]:
+        if example_id is None:
+            raise ValueError("Example ID is None.")
+        if example_id not in self.items:
+            return []
+        return self.items[example_id]
+
+    def add(self, example_id: Optional[int], spans: List[Span]):
+        if example_id is None:
+            return
+        self.items[example_id] = spans
+
+    def save(self, project_dir: pathlib.Path):
+        path = project_dir / self.filename
+        with path.open("w") as f:
+            spans = {example_id: [span.dict() for span in spans] for example_id, spans in self.items.items()}
+            json.dump(spans, f)
+
+    @classmethod
+    def load(cls, project_dir: pathlib.Path):
+        path = project_dir / cls.filename
+        if not path.exists():
+            return cls()
+        with path.open() as f:
+            items = json.load(f)
+        items = {int(example_id): [Span.parse_obj(span) for span in spans] for example_id, spans in items.items()}
+        spans = cls(items)
+        return spans
+
+    def filter_by(self, example_ids: List[int]) -> "Spans":
+        return Spans({example_id: self.items[example_id] for example_id in example_ids})
+
+
+class NERDataset:
+    def __init__(self, examples: Optional[Examples] = None, spans: Optional[Spans] = None):
+        self.examples = examples or Examples()
+        self.spans = spans or Spans()
+
+    def __iter__(self) -> Iterator[Tuple[Example, List[Span]]]:
+        for example_id in self.examples.ids:
+            yield self.examples[example_id], self.spans[example_id]
+
+    def split(self, test_size: float = 0.2, random_state: int = 42) -> Tuple["NERDataset", "NERDataset"]:
+        train_ids, test_ids = train_test_split(self.examples.ids, test_size=test_size, random_state=random_state)
+        train_examples = self.examples.filter_by_ids(train_ids)
+        train_spans = self.spans.filter_by(train_ids)
+        test_examples = self.examples.filter_by_ids(test_ids)
+        test_spans = self.spans.filter_by(test_ids)
+        return NERDataset(train_examples, train_spans), NERDataset(test_examples, test_spans)
+
+    def save(self, project_dir: pathlib.Path):
+        self.examples.save(project_dir)
+        self.spans.save(project_dir)
+
+    @classmethod
+    def load(cls, project_dir: pathlib.Path):
+        examples = Examples.load(project_dir)
+        spans = Spans.load(project_dir)
+        return cls(examples, spans)
+
+    def add_spans(self, example_id: Optional[int], spans: List[Span]):
+        if example_id is None:
+            return
+        self.spans.add(example_id, spans)
+
+    def has_spans(self, example_id: Optional[int]) -> bool:
+        if example_id is None:
+            return False
+        return example_id in self.spans
+
+    def confirm(self, example_id: Optional[int]):
+        if example_id is None:
+            return
+        self.examples.confirm(example_id)
+
+    @property
+    def labeled(self) -> "NERDataset":
+        examples = self.examples.filter_by(is_confirmed=True)
+        spans = self.spans.filter_by(examples.ids)
+        return NERDataset(examples, spans)
+
+    @property
+    def unlabeled(self) -> "NERDataset":
+        return NERDataset(self.examples.filter_by(is_confirmed=False))
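A short sketch of how these containers compose: a dataset cached on disk is reloaded, narrowed to confirmed examples, and split for training. The directory layout (`DOCCANO_HOME/<project_id>/dataset`) matches what `preparation.py` below writes; the path here is a placeholder:

```python
# Sketch: load a cached dataset, keep confirmed examples, split for training.
import pathlib

from doccano_client.cli.active_learning.models import NERDataset

project_dir = pathlib.Path.home() / "doccano" / "1" / "dataset"  # placeholder path
dataset = NERDataset.load(project_dir)

train, test = dataset.labeled.split(test_size=0.2)
print(len(train.examples.ids), len(test.examples.ids))
```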
diff --git a/doccano_client/cli/active_learning/preparation.py b/doccano_client/cli/active_learning/preparation.py
new file mode 100644
index 0000000..7b65e80
--- /dev/null
+++ b/doccano_client/cli/active_learning/preparation.py
@@ -0,0 +1,122 @@
+import os
+import pathlib
+from typing import List
+
+import spacy
+from flair.data import Sentence, Token
+from flair.datasets import ColumnCorpus
+from spacy.training import offsets_to_biluo_tags
+
+from doccano_client import DoccanoClient
+from doccano_client.models.example import Example
+from doccano_client.models.label import Span
+
+from .models import Examples, NERDataset
+
+DOCCANO_HOME = pathlib.Path(os.path.expanduser(os.environ.get("DOCCANO_HOME", "~/doccano")))
+
+
+class UnlabeledDataset:
+    def __init__(self):
+        self.items = []
+
+    @property
+    def sentences(self):
+        return [sentence for _, sentence in self.items]
+
+    @property
+    def ids(self):
+        return [example_id for example_id, _ in self.items]
+
+    def add(self, example: Example, sentence: Sentence):
+        self.items.append((example.id, sentence))
+
+
+def download_dataset(client: DoccanoClient, project_id: int) -> NERDataset:
+    dataset_dir = DOCCANO_HOME / str(project_id) / "dataset"
+    if not dataset_dir.exists():
+        print(f"Downloading dataset for project {project_id}")
+        dataset_dir.mkdir(parents=True, exist_ok=True)
+        examples = Examples(client.list_examples(project_id))
+        dataset = NERDataset(examples)
+        dataset.save(dataset_dir)
+    else:
+        print(f"Loading dataset for project {project_id}")
+        dataset = NERDataset.load(dataset_dir)
+
+    for example in client.list_examples(project_id, is_confirmed=True):
+        if not dataset.has_spans(example.id):
+            spans = client.list_spans(project_id, example.id)  # type: ignore
+            dataset.add_spans(example.id, spans)
+            dataset.confirm(example.id)
+    dataset.save(dataset_dir)
+    return dataset
+
+
+def make_nlp(lang: str = "en"):
+    if lang == "cz":  # flair uses "cz" for Czech, spaCy expects "cs"
+        lang = "cs"
+    nlp = spacy.blank(lang)
+    return nlp
+
+
+def prepare_datasets(client: DoccanoClient, project_id: int, lang: str = "en"):
+    # download dataset
+    dataset = download_dataset(client, project_id)
+
+    # split train/test dataset
+    train_dataset, test_dataset = dataset.labeled.split(test_size=0.5)
+
+    # convert dataset to CoNLL format
+    nlp = make_nlp(lang)
+    save_dir = DOCCANO_HOME / str(project_id) / "dataset"
+    export_examples_to_conll(nlp, train_dataset, save_dir / "train.txt")
+    export_examples_to_conll(nlp, test_dataset, save_dir / "test.txt")
+
+    # load datasets for flair
+    labeled_dataset = load_labeled_dataset(save_dir)
+    unlabeled_dataset = load_unlabeled_dataset(nlp, dataset.unlabeled)
+    return labeled_dataset, unlabeled_dataset
+
+
+def convert_example_to_conll(nlp: spacy.Language, example: Example, spans: List[Span]):
+    doc = nlp(example.text)  # type: ignore
+    ents = [span.to_tuple() for span in spans]
+    tags = offsets_to_biluo_tags(doc, ents)  # type: ignore
+    for token, tag in zip(doc, tags):
+        tag = tag.replace("U-", "S-")  # convert spaCy's BILUO scheme to flair's BIOES
+        tag = tag.replace("L-", "E-")
+        yield f"{token.text}\t{tag}\n"
+
+
+def export_examples_to_conll(nlp: spacy.Language, dataset: NERDataset, path: pathlib.Path):
+    with path.open("w", encoding="utf-8") as f:
+        for example, spans in dataset:
+            lines = convert_example_to_conll(nlp, example, spans)
+            f.writelines(lines)
+            f.write("\n")
+
+
+def load_labeled_dataset(data_dir: pathlib.Path):
+    columns = {0: "text", 1: "ner"}
+    corpus = ColumnCorpus(
+        data_dir,
+        columns,
+        train_file="train.txt",
+        dev_file="test.txt",
+        test_file="test.txt",
+    )
+    return corpus
+
+
+def load_unlabeled_dataset(nlp: spacy.Language, dataset: NERDataset):
+    unlabeled_dataset = UnlabeledDataset()
+    for example, _ in dataset:
+        doc = nlp(example.text)  # type: ignore
+        sentence = Sentence()
+        for word in doc:
+            token = Token(text=word.text, start_position=word.idx, whitespace_after=word.whitespace_)
+            token.add_tag("ner", "O")
+            sentence.add_token(token)
+        unlabeled_dataset.add(example, sentence)
+    return unlabeled_dataset
diff --git a/doccano_client/cli/active_learning/strategies.py b/doccano_client/cli/active_learning/strategies.py
new file mode 100644
index 0000000..32e3207
--- /dev/null
+++ b/doccano_client/cli/active_learning/strategies.py
@@ -0,0 +1,33 @@
+from typing import List, Literal
+
+import numpy as np
+from flair.data import Sentence
+from seqal.tagger import SequenceTagger
+
+
+def least_confidence(
+    sentences: List[Sentence],
+    tagger: SequenceTagger,
+) -> np.ndarray:
+    log_probs = tagger.log_probability(sentences)
+    # Interpret the sequence probability as the model's confidence
+    scores = np.exp(log_probs)
+    return scores
+
+
+def maximum_normalized_log_probability(
+    sentences: List[Sentence],
+    tagger: SequenceTagger,
+) -> np.ndarray:
+    log_probs = tagger.log_probability(sentences)
+    lengths = np.array([len(sent) for sent in sentences])
+    normed_log_probs = log_probs / lengths
+    return normed_log_probs
+
+
+def get_query_strategy(query_strategy: Literal["MNLP", "LC"] = "MNLP"):
+    if query_strategy == "LC":
+        return least_confidence
+    elif query_strategy == "MNLP":
+        return maximum_normalized_log_probability
+    raise ValueError(f"Query strategy {query_strategy} is not available")
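To see what length normalization buys, here is a toy comparison of the two strategies with stubbed log-probabilities (the numbers are made up; a real run gets them from `tagger.log_probability`):

```python
# Toy check: MNLP divides the sequence log-probability by sentence length,
# so long sentences are not automatically ranked as least confident.
import numpy as np

log_probs = np.array([-2.0, -8.0])  # stand-ins for tagger.log_probability(...)
lengths = np.array([4, 20])         # corresponding sentence lengths

lc_scores = np.exp(log_probs)       # least_confidence(): raw sequence probability
mnlp_scores = log_probs / lengths   # maximum_normalized_log_probability()

print(lc_scores)    # [1.35e-01 3.35e-04] -> the long sentence looks worst
print(mnlp_scores)  # [-0.5 -0.4]         -> per token, it is actually more confident
```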
diff --git a/doccano_client/cli/active_learning/trainer.py b/doccano_client/cli/active_learning/trainer.py
new file mode 100644
index 0000000..553e6b2
--- /dev/null
+++ b/doccano_client/cli/active_learning/trainer.py
@@ -0,0 +1,52 @@
+from typing import Any, Dict, Optional
+
+from flair.data import Corpus
+from flair.embeddings import TransformerWordEmbeddings, WordEmbeddings
+
+from .languages import LANGUAGES
+
+
+def get_tagger_params(
+    corpus: Corpus,
+    lang: str = "en",
+    transformer_model: Optional[str] = None,
+    hidden_size: int = 256,
+    use_rnn: bool = False,
+    use_crf: bool = True,
+    **kwargs,
+) -> Dict[str, Any]:
+    if lang not in LANGUAGES and transformer_model is None:
+        raise ValueError(f"Language {lang} is not available")
+
+    if transformer_model:
+        embeddings = TransformerWordEmbeddings(transformer_model)
+    else:
+        embeddings = WordEmbeddings(lang)  # classic fastText embeddings for the language
+
+    tagger_params = {
+        "tag_type": "ner",
+        "tag_dictionary": corpus.make_tag_dictionary(tag_type="ner"),
+        "embeddings": embeddings,
+        "hidden_size": hidden_size,
+        "use_rnn": use_rnn,
+        "use_crf": use_crf,
+    }
+    return tagger_params
+
+
+def get_trainer_params(
+    max_epochs: int = 10,
+    patience: int = 3,
+    learning_rate: float = 0.1,
+    mini_batch_size: int = 32,
+    shuffle: bool = True,
+    **kwargs,
+) -> Dict[str, Any]:
+    trainer_params = {
+        "max_epochs": max_epochs,
+        "learning_rate": learning_rate,
+        "mini_batch_size": mini_batch_size,
+        "patience": patience,
+        "shuffle": shuffle,
+    }
+    return trainer_params
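Put together, the two helpers configure a seqal `SequenceTagger` and a flair `ModelTrainer` the same way `execute_one_iteration` does. A sketch, assuming `corpus` is a flair `Corpus` such as the `ColumnCorpus` built in `preparation.py`:

```python
# Sketch: assemble a tagger and trainer from the helper params above.
from flair.data import Corpus
from flair.trainers import ModelTrainer
from seqal.tagger import SequenceTagger

from doccano_client.cli.active_learning.trainer import get_tagger_params, get_trainer_params


def build_trainer(corpus: Corpus, lang: str = "en") -> ModelTrainer:
    tagger = SequenceTagger(**get_tagger_params(corpus, lang=lang))
    return ModelTrainer(tagger, corpus)


# trainer = build_trainer(corpus)                   # corpus: a labeled flair corpus
# trainer.train("models/", **get_trainer_params())  # the output dir is a placeholder
```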
diff --git a/doccano_client/cli/commands.py b/doccano_client/cli/commands.py
index 03ef7ff..22e3043 100644
--- a/doccano_client/cli/commands.py
+++ b/doccano_client/cli/commands.py
@@ -7,6 +7,7 @@ from pathlib import Path
 
 from doccano_client import DoccanoClient
+from doccano_client.cli.active_learning.languages import LANGUAGES
 from doccano_client.cli.estimators import select_estimator_class
 from doccano_client.cli.usecases import build_annotator
 
@@ -46,6 +47,21 @@ def command_predict(args):
     client.logout()
 
 
+def command_teach(args):
+    from .active_learning.manager import execute_active_learning
+
+    client = command_login(args)
+    execute_active_learning(
+        client,
+        project_id=args.project,
+        lang=args.lang,
+        query_strategy_name=args.query_strategy,
+        transformer_model=args.transformer_model,
+        train_frequency=args.train_frequency,
+    )
+    client.logout()
+
+
 def command_help(args):
     print(parser.parse_args([args.command, "--help"]))
 
@@ -70,6 +86,29 @@ def main():
     parser_predict.add_argument("--framework", default="spacy", choices=["spacy"], help="framework to predict output")
     parser_predict.set_defaults(handler=command_predict)
 
+    # Create a parser for active learning
+    parser_teach = subparsers.add_parser("teach", help="see `teach -h`")
+    parser_teach.add_argument("--task", type=str, choices=["ner"], required=True, help="task name")
+    parser_teach.add_argument("--project", type=int, required=True, help="project id")
+    parser_teach.add_argument("--lang", type=str, choices=LANGUAGES, default="en", help="language code")
+    parser_teach.add_argument(
+        "--query_strategy",
+        type=str,
+        choices=["LC", "MNLP"],
+        default="MNLP",
+        help="query strategy. LC is least confidence, MNLP is maximum normalized log-probability.",
+    )
+    parser_teach.add_argument(
+        "--transformer_model", type=str, required=False, help="transformer model name (e.g. bert-base-uncased)"
+    )
+    parser_teach.add_argument(
+        "--train_frequency",
+        type=int,
+        default=50,
+        help="how often to retrain during annotation (number of newly confirmed examples)",
+    )
+    parser_teach.set_defaults(handler=command_teach)
+
     # Create a parser for help.
     parser_help = subparsers.add_parser("help", help="see `help -h`")
     parser_help.add_argument("command", help="command name which help is shown")
diff --git a/doccano_client/models/label.py b/doccano_client/models/label.py
index cfa3822..5ddc116 100644
--- a/doccano_client/models/label.py
+++ b/doccano_client/models/label.py
@@ -34,6 +34,9 @@ def check_start_offset_is_less_than_end_offset(cls, values):
             raise ValueError("start_offset must be less than end_offset.")
         return values
 
+    def to_tuple(self) -> tuple:
+        return self.start_offset, self.end_offset, self.label
+
 
 class Relation(Label):
     from_id: int
diff --git a/doccano_client/models/metrics.py b/doccano_client/models/metrics.py
index 3261455..d03c45a 100644
--- a/doccano_client/models/metrics.py
+++ b/doccano_client/models/metrics.py
@@ -22,6 +22,9 @@ class Progress(BaseModel):
     remaining: int
     completed: int
 
+    def is_finished(self) -> bool:
+        return self.remaining == 0
+
 
 class MemberProgress(BaseModel):
     username: str
diff --git a/docs/cli.md b/docs/cli.md
index c800262..0c8f97d 100644
--- a/docs/cli.md
+++ b/docs/cli.md
@@ -71,3 +71,34 @@ docli predict asr \
   --project \
   --model base
 ```
+
+## Active Learning
+
+To use this feature, install doccano-client with the `al` extra:
+
+```bash
+pip install doccano-client[al]
+```
+
+First, log in to doccano:
+
+```bash
+docli login \
+  --host http://127.0.0.1:8000 \
+  --username admin \
+  --password password
+```
+
+Then, start active learning as follows (bracketed values are the defaults; `--transformer_model` is optional and takes a model name such as `bert-base-uncased`):
+
+```bash
+docli teach \
+  --task \
+  --project \
+  --lang [en] \
+  --query_strategy [MNLP] \
+  --train_frequency [50] \
+  --transformer_model [bert-base-uncased]
+```
+
+Currently, only `ner` is supported as a task.
diff --git a/poetry.lock b/poetry.lock
index 8b40419..d300fc0 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -39,12 +39,27 @@ name = "blis"
 version = "0.9.1"
 description = "The Blis BLAS-like linear algebra library, as a self-contained C-extension."
category = "main" -optional = true +optional = false python-versions = "*" [package.dependencies] numpy = ">=1.15.0" +[[package]] +name = "bpemb" +version = "0.3.4" +description = "Byte-pair embeddings in 275 languages" +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +gensim = "*" +numpy = "*" +requests = "*" +sentencepiece = "*" +tqdm = "*" + [[package]] name = "cached-property" version = "1.5.2" @@ -58,7 +73,7 @@ name = "catalogue" version = "2.0.8" description = "Super lightweight function registries for your library" category = "main" -optional = true +optional = false python-versions = ">=3.6" [[package]] @@ -104,19 +119,53 @@ name = "confection" version = "0.0.1" description = "The sweetest config system for Python" category = "main" -optional = true +optional = false python-versions = ">=3.6" [package.dependencies] pydantic = ">=1.7.4,<1.8 || >1.8,<1.8.1 || >1.8.1,<1.10.0" srsly = ">=2.4.0,<3.0.0" +[[package]] +name = "conllu" +version = "4.5.2" +description = "CoNLL-U Parser parses a CoNLL-U formatted string into a nested python dictionary" +category = "main" +optional = false +python-versions = ">=3.6" + +[[package]] +name = "contourpy" +version = "1.0.5" +description = "Python library for calculating contours of 2D quadrilateral grids" +category = "main" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +numpy = ">=1.16" + +[package.extras] +test-no-codebase = ["pillow", "matplotlib", "pytest"] +test-minimal = ["pytest"] +test = ["isort", "flake8", "pillow", "matplotlib", "pytest"] +docs = ["sphinx-rtd-theme", "sphinx", "docutils (<0.18)"] +bokeh = ["selenium", "bokeh"] + +[[package]] +name = "cycler" +version = "0.11.0" +description = "Composable style cycles" +category = "main" +optional = false +python-versions = ">=3.6" + [[package]] name = "cymem" version = "2.0.6" description = "Manage calls to calloc/free through Cython" category = "main" -optional = true +optional = false python-versions = "*" [[package]] @@ -143,6 +192,20 @@ typing-inspect = ">=0.4.0" [package.extras] dev = ["pytest (>=6.2.3)", "ipython", "mypy (>=0.710)", "hypothesis", "portray", "flake8", "simplejson", "types-dataclasses"] +[[package]] +name = "deprecated" +version = "1.2.13" +description = "Python @deprecated decorator to deprecate old python classes, functions or methods." +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" + +[package.dependencies] +wrapt = ">=1.10,<2" + +[package.extras] +dev = ["tox", "bump2version (<1)", "sphinx (<2)", "importlib-metadata (<3)", "importlib-resources (<4)", "configparser (<5)", "sphinxcontrib-websupport (<2)", "zipp (<2)", "PyTest (<5)", "PyTest-Cov (<2.6)", "pytest", "pytest-cov"] + [[package]] name = "ffmpeg-python" version = "0.2.0" @@ -162,13 +225,48 @@ name = "filelock" version = "3.8.0" description = "A platform independent file lock." 
category = "main" -optional = true +optional = false python-versions = ">=3.7" [package.extras] docs = ["furo (>=2022.6.21)", "sphinx (>=5.1.1)", "sphinx-autodoc-typehints (>=1.19.1)"] testing = ["covdefaults (>=2.2)", "coverage (>=6.4.2)", "pytest (>=7.1.2)", "pytest-cov (>=3)", "pytest-timeout (>=2.1)"] +[[package]] +name = "flair" +version = "0.10" +description = "A very simple framework for state-of-the-art NLP" +category = "main" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +bpemb = ">=0.3.2" +conllu = ">=4.0" +deprecated = ">=1.2.4" +ftfy = "*" +gdown = "3.12.2" +gensim = ">=3.4.0" +huggingface-hub = "*" +janome = "*" +konoha = ">=4.0.0,<5.0.0" +langdetect = "*" +lxml = "*" +matplotlib = ">=2.2.3" +more-itertools = ">=8.8.0,<8.9.0" +mpld3 = "0.3" +python-dateutil = ">=2.6.1" +regex = "*" +scikit-learn = ">=0.21.3" +segtok = ">=1.5.7" +sentencepiece = "0.1.95" +sqlitedict = ">=1.6.0" +tabulate = "*" +torch = ">=1.5.0,<1.8 || >1.8" +tqdm = ">=4.26.0" +transformers = ">=4.0.0" +wikipedia-api = "*" + [[package]] name = "flake8" version = "5.0.4" @@ -182,6 +280,39 @@ mccabe = ">=0.7.0,<0.8.0" pycodestyle = ">=2.9.0,<2.10.0" pyflakes = ">=2.5.0,<2.6.0" +[[package]] +name = "fonttools" +version = "4.37.4" +description = "Tools to manipulate font files" +category = "main" +optional = false +python-versions = ">=3.7" + +[package.extras] +all = ["fs (>=2.2.0,<3)", "lxml (>=4.0,<5)", "zopfli (>=0.1.4)", "lz4 (>=1.7.4.2)", "matplotlib", "sympy", "skia-pathops (>=0.5.0)", "uharfbuzz (>=0.23.0)", "brotlicffi (>=0.8.0)", "scipy", "brotli (>=1.0.1)", "munkres", "unicodedata2 (>=14.0.0)", "xattr"] +graphite = ["lz4 (>=1.7.4.2)"] +interpolatable = ["scipy", "munkres"] +lxml = ["lxml (>=4.0,<5)"] +pathops = ["skia-pathops (>=0.5.0)"] +plot = ["matplotlib"] +repacker = ["uharfbuzz (>=0.23.0)"] +symfont = ["sympy"] +type1 = ["xattr"] +ufo = ["fs (>=2.2.0,<3)"] +unicode = ["unicodedata2 (>=14.0.0)"] +woff = ["zopfli (>=0.1.4)", "brotlicffi (>=0.8.0)", "brotli (>=1.0.1)"] + +[[package]] +name = "ftfy" +version = "6.1.1" +description = "Fixes mojibake and other problems with Unicode, after the fact" +category = "main" +optional = false +python-versions = ">=3.7,<4" + +[package.dependencies] +wcwidth = ">=0.2.5" + [[package]] name = "future" version = "0.18.2" @@ -190,6 +321,39 @@ category = "main" optional = false python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" +[[package]] +name = "gdown" +version = "3.12.2" +description = "Google Drive direct download of big files." 
+category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +filelock = "*" +requests = {version = "*", extras = ["socks"]} +six = "*" +tqdm = "*" + +[[package]] +name = "gensim" +version = "4.2.0" +description = "Python framework for fast Vector Space Modelling" +category = "main" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +numpy = ">=1.17.0" +scipy = ">=0.18.1" +smart-open = ">=1.8.1" + +[package.extras] +distributed = ["Pyro4 (>=4.27)"] +docs = ["pytest", "pytest-cov", "mock", "cython", "testfixtures", "pyemd", "nmslib", "Pyro4 (>=4.27)", "visdom (>0.1.8.7)", "sphinx", "sphinx-gallery", "sphinxcontrib.programoutput", "sphinxcontrib-napoleon", "matplotlib", "memory-profiler", "annoy", "pyro4", "nltk", "statsmodels", "pandas"] +test = ["pytest", "pytest-cov", "mock", "cython", "testfixtures", "pyemd", "nmslib", "visdom (>0.1.8.7)"] +test-win = ["pytest", "pytest-cov", "mock", "cython", "testfixtures", "pyemd", "nmslib"] + [[package]] name = "ghp-import" version = "2.1.0" @@ -220,7 +384,7 @@ name = "huggingface-hub" version = "0.9.1" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" category = "main" -optional = true +optional = false python-versions = ">=3.7.0" [package.dependencies] @@ -286,6 +450,14 @@ requirements_deprecated_finder = ["pipreqs", "pip-api"] colors = ["colorama (>=0.4.3,<0.5.0)"] plugins = ["setuptools"] +[[package]] +name = "janome" +version = "0.4.2" +description = "Japanese morphological analysis engine." +category = "main" +optional = false +python-versions = "*" + [[package]] name = "jinja2" version = "3.1.2" @@ -300,17 +472,83 @@ MarkupSafe = ">=2.0" [package.extras] i18n = ["Babel (>=2.7)"] +[[package]] +name = "joblib" +version = "1.2.0" +description = "Lightweight pipelining with Python functions" +category = "main" +optional = false +python-versions = ">=3.7" + +[[package]] +name = "kiwisolver" +version = "1.4.4" +description = "A fast implementation of the Cassowary constraint solver" +category = "main" +optional = false +python-versions = ">=3.7" + +[[package]] +name = "konoha" +version = "4.6.3" +description = "A tiny sentence/word tokenizer for Japanese text written in Python" +category = "main" +optional = false +python-versions = ">=3.6.1,<4.0.0" + +[package.dependencies] +overrides = ">=3.0.0,<4.0.0" + +[package.extras] +janome = ["janome (>=0.3.10,<0.4.0)"] +all = ["janome (>=0.3.10,<0.4.0)", "natto-py (>=0.9.0,<0.10.0)", "kytea (>=0.1.4,<0.2.0)", "sentencepiece (>=0.1.85,<0.2.0)", "sudachipy (==0.4.9)", "boto3 (>=1.11.0,<2.0.0)", "fastapi (>=0.54.1,<0.55.0)", "uvicorn (>=0.11.5,<0.12.0)", "sudachidict-core (>=20200330,<20200331)", "nagisa (>=0.2.7,<0.3.0)"] +all_with_integrations = ["janome (>=0.3.10,<0.4.0)", "natto-py (>=0.9.0,<0.10.0)", "kytea (>=0.1.4,<0.2.0)", "sentencepiece (>=0.1.85,<0.2.0)", "sudachipy (==0.4.9)", "boto3 (>=1.11.0,<2.0.0)", "allennlp (>=1.3.0,<2.0.0)", "fastapi (>=0.54.1,<0.55.0)", "uvicorn (>=0.11.5,<0.12.0)", "sudachidict-core (>=20200330,<20200331)", "nagisa (>=0.2.7,<0.3.0)"] +mecab = ["natto-py (>=0.9.0,<0.10.0)"] +kytea = ["kytea (>=0.1.4,<0.2.0)"] +sentencepiece = ["sentencepiece (>=0.1.85,<0.2.0)"] +sudachi = ["sudachipy (==0.4.9)", "sudachidict-core (>=20200330,<20200331)"] +remote = ["boto3 (>=1.11.0,<2.0.0)"] +allennlp = ["allennlp (>=1.3.0,<2.0.0)"] +server = ["fastapi (>=0.54.1,<0.55.0)", "uvicorn (>=0.11.5,<0.12.0)"] +docs = ["sphinx (>=3.1.1,<4.0.0)", "sphinx_rtd_theme (>=0.4.3,<0.5.0)"] +nagisa = 
["nagisa (>=0.2.7,<0.3.0)"] + [[package]] name = "langcodes" version = "3.3.0" description = "Tools for labeling human languages with IETF language tags" category = "main" -optional = true +optional = false python-versions = ">=3.6" [package.extras] data = ["language-data (>=1.1,<2.0)"] +[[package]] +name = "langdetect" +version = "1.0.9" +description = "Language detection library ported from Google's language-detection." +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +six = "*" + +[[package]] +name = "lxml" +version = "4.9.1" +description = "Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API." +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, != 3.4.*" + +[package.extras] +cssselect = ["cssselect (>=0.7)"] +html5 = ["html5lib"] +htmlsoup = ["beautifulsoup4"] +source = ["Cython (>=0.29.7)"] + [[package]] name = "markdown" version = "3.3.7" @@ -372,6 +610,26 @@ python-versions = "*" [package.dependencies] marshmallow = ">=2.0.0" +[[package]] +name = "matplotlib" +version = "3.6.1" +description = "Python plotting package" +category = "main" +optional = false +python-versions = ">=3.8" + +[package.dependencies] +contourpy = ">=1.0.1" +cycler = ">=0.10" +fonttools = ">=4.22.0" +kiwisolver = ">=1.0.1" +numpy = ">=1.19" +packaging = ">=20.0" +pillow = ">=6.2.0" +pyparsing = ">=2.2.1" +python-dateutil = ">=2.7" +setuptools_scm = ">=7" + [[package]] name = "mccabe" version = "0.7.0" @@ -510,6 +768,22 @@ python-versions = ">=3.7" griffe = ">=0.11.1" mkdocstrings = ">=0.19" +[[package]] +name = "more-itertools" +version = "8.8.0" +description = "More routines for operating on iterables, beyond itertools" +category = "main" +optional = false +python-versions = ">=3.5" + +[[package]] +name = "mpld3" +version = "0.3" +description = "D3 Viewer for Matplotlib" +category = "main" +optional = false +python-versions = "*" + [[package]] name = "mslex" version = "0.3.0" @@ -531,7 +805,7 @@ name = "murmurhash" version = "1.0.8" description = "Cython bindings for MurmurHash" category = "main" -optional = true +optional = false python-versions = ">=3.6" [[package]] @@ -565,9 +839,17 @@ name = "numpy" version = "1.23.3" description = "NumPy is the fundamental package for array computing with Python." category = "main" -optional = true +optional = false python-versions = ">=3.8" +[[package]] +name = "overrides" +version = "3.1.0" +description = "A decorator to automatically detect mismatch when overriding a method." 
+category = "main" +optional = false +python-versions = "*" + [[package]] name = "packaging" version = "21.3" @@ -603,7 +885,7 @@ name = "pathy" version = "0.6.2" description = "pathlib.Path subclasses for local and cloud bucket storage" category = "main" -optional = true +optional = false python-versions = ">= 3.6" [package.dependencies] @@ -616,6 +898,18 @@ s3 = ["boto3"] gcs = ["google-cloud-storage (>=1.26.0,<2.0.0)"] all = ["typer-cli", "mock", "pytest-coverage", "pytest", "boto3", "google-cloud-storage (>=1.26.0,<2.0.0)"] +[[package]] +name = "pillow" +version = "9.2.0" +description = "Python Imaging Library (Fork)" +category = "main" +optional = false +python-versions = ">=3.7" + +[package.extras] +docs = ["furo", "olefile", "sphinx (>=2.4)", "sphinx-copybutton", "sphinx-issues (>=3.0.1)", "sphinx-removed-in", "sphinxext-opengraph"] +tests = ["check-manifest", "coverage", "defusedxml", "markdown2", "olefile", "packaging", "pyroma", "pytest", "pytest-cov", "pytest-timeout"] + [[package]] name = "platformdirs" version = "2.5.2" @@ -645,7 +939,7 @@ name = "preshed" version = "3.0.7" description = "Cython hash table that trusts the keys are pre-hashed" category = "main" -optional = true +optional = false python-versions = ">=3.6" [package.dependencies] @@ -747,6 +1041,14 @@ python-versions = "*" flake8 = "5.0.4" tomli = {version = "*", markers = "python_version < \"3.11\""} +[[package]] +name = "pysocks" +version = "1.7.1" +description = "A Python SOCKS client module. See https://github.com/Anorov/PySocks for more information." +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" + [[package]] name = "pytest" version = "7.1.3" @@ -771,7 +1073,7 @@ testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2. name = "python-dateutil" version = "2.8.2" description = "Extensions to the standard Python datetime module" -category = "dev" +category = "main" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" @@ -810,7 +1112,7 @@ name = "regex" version = "2022.9.13" description = "Alternative regular expression module, to replace re." 
category = "main" -optional = true +optional = false python-versions = ">=3.6" [[package]] @@ -825,6 +1127,7 @@ python-versions = ">=3.7, <4" certifi = ">=2017.4.17" charset-normalizer = ">=2,<3" idna = ">=2.5,<4" +PySocks = {version = ">=1.5.6,<1.5.7 || >1.5.7", optional = true, markers = "extra == \"socks\""} urllib3 = ">=1.21.1,<1.27" [package.extras] @@ -857,11 +1160,97 @@ urllib3 = ">=1.25.10" [package.extras] tests = ["pytest (>=7.0.0)", "coverage (>=6.0.0)", "pytest-cov", "pytest-asyncio", "pytest-localserver", "flake8", "types-mock", "types-requests", "mypy"] +[[package]] +name = "scikit-learn" +version = "1.1.2" +description = "A set of python modules for machine learning and data mining" +category = "main" +optional = false +python-versions = ">=3.8" + +[package.dependencies] +joblib = ">=1.0.0" +numpy = ">=1.17.3" +scipy = ">=1.3.2" +threadpoolctl = ">=2.0.0" + +[package.extras] +tests = ["numpydoc (>=1.2.0)", "pyamg (>=4.0.0)", "mypy (>=0.961)", "black (>=22.3.0)", "flake8 (>=3.8.2)", "pytest-cov (>=2.9.0)", "pytest (>=5.0.1)", "pandas (>=1.0.5)", "scikit-image (>=0.16.2)", "matplotlib (>=3.1.2)"] +examples = ["seaborn (>=0.9.0)", "pandas (>=1.0.5)", "scikit-image (>=0.16.2)", "matplotlib (>=3.1.2)"] +docs = ["sphinxext-opengraph (>=0.4.2)", "sphinx-prompt (>=1.3.0)", "Pillow (>=7.1.2)", "numpydoc (>=1.2.0)", "sphinx-gallery (>=0.7.0)", "sphinx (>=4.0.1)", "memory-profiler (>=0.57.0)", "seaborn (>=0.9.0)", "pandas (>=1.0.5)", "scikit-image (>=0.16.2)", "matplotlib (>=3.1.2)"] +benchmark = ["memory-profiler (>=0.57.0)", "pandas (>=1.0.5)", "matplotlib (>=3.1.2)"] + +[[package]] +name = "scipy" +version = "1.9.2" +description = "Fundamental algorithms for scientific computing in Python" +category = "main" +optional = false +python-versions = ">=3.8" + +[package.dependencies] +numpy = ">=1.18.5,<1.26.0" + +[package.extras] +test = ["pytest", "pytest-cov", "pytest-xdist", "asv", "mpmath", "gmpy2", "threadpoolctl", "scikit-umfpack"] +doc = ["sphinx (!=4.1.0)", "pydata-sphinx-theme (==0.9.0)", "sphinx-panels (>=0.5.2)", "matplotlib (>2)", "numpydoc", "sphinx-tabs"] +dev = ["mypy", "typing-extensions", "pycodestyle", "flake8"] + +[[package]] +name = "segtok" +version = "1.5.11" +description = "sentence segmentation and word tokenization tools" +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +regex = "*" + +[[package]] +name = "sentencepiece" +version = "0.1.95" +description = "SentencePiece python wrapper" +category = "main" +optional = false +python-versions = "*" + +[[package]] +name = "seqal" +version = "0.3.4" +description = "Sequence labeling active learning framework for Python" +category = "main" +optional = false +python-versions = ">=3.8,<4.0" + +[package.dependencies] +flair = "0.10" +scipy = ">=1.8.0,<2.0.0" +spacy = ">=3.4.1,<4.0.0" +torch = ">=1.10.0,<2.0.0" + +[[package]] +name = "setuptools-scm" +version = "7.0.5" +description = "the blessed package to manage your versions by scm tags" +category = "main" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +packaging = ">=20.0" +tomli = ">=1.0.0" +typing-extensions = "*" + +[package.extras] +test = ["pytest (>=6.2)", "virtualenv (>20)"] +toml = ["setuptools (>=42)"] + [[package]] name = "six" version = "1.16.0" description = "Python 2 and 3 compatibility utilities" -category = "dev" +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" @@ -870,7 +1259,7 @@ name = "smart-open" version = "5.2.1" description = "Utils for streaming 
large files (S3, HDFS, GCS, Azure Blob Storage, gzip, bz2...)" category = "main" -optional = true +optional = false python-versions = ">=3.6,<4.0" [package.extras] @@ -887,7 +1276,7 @@ name = "spacy" version = "3.4.1" description = "Industrial-strength Natural Language Processing (NLP) in Python" category = "main" -optional = true +optional = false python-versions = ">=3.6" [package.dependencies] @@ -949,7 +1338,7 @@ name = "spacy-legacy" version = "3.0.10" description = "Legacy registered functions for spaCy backwards compatibility" category = "main" -optional = true +optional = false python-versions = ">=3.6" [[package]] @@ -957,7 +1346,7 @@ name = "spacy-loggers" version = "1.0.3" description = "Logging utilities for SpaCy" category = "main" -optional = true +optional = false python-versions = ">=3.6" [package.dependencies] @@ -1007,17 +1396,36 @@ cuda90 = ["cupy-cuda90 (>=5.0.0b4)"] cuda91 = ["cupy-cuda91 (>=5.0.0b4)"] cuda92 = ["cupy-cuda92 (>=5.0.0b4)"] +[[package]] +name = "sqlitedict" +version = "2.0.0" +description = "Persistent dict in Python, backed up by sqlite3 and pickle, multithread-safe." +category = "main" +optional = false +python-versions = "*" + [[package]] name = "srsly" version = "2.4.4" description = "Modern high-performance serialization utilities for Python" category = "main" -optional = true +optional = false python-versions = ">=3.6" [package.dependencies] catalogue = ">=2.0.3,<2.1.0" +[[package]] +name = "tabulate" +version = "0.9.0" +description = "Pretty-print tabular data" +category = "main" +optional = false +python-versions = ">=3.7" + +[package.extras] +widechars = ["wcwidth"] + [[package]] name = "taskipy" version = "1.10.3" @@ -1037,7 +1445,7 @@ name = "thinc" version = "8.1.1" description = "A refreshing functional take on deep learning, compatible with your favorite libraries" category = "main" -optional = true +optional = false python-versions = ">=3.6" [package.dependencies] @@ -1072,12 +1480,20 @@ mxnet = ["mxnet (>=1.5.1,<1.6.0)"] tensorflow = ["tensorflow (>=2.0.0,<2.6.0)"] torch = ["torch (>=1.6.0)"] +[[package]] +name = "threadpoolctl" +version = "3.1.0" +description = "threadpoolctl" +category = "main" +optional = false +python-versions = ">=3.6" + [[package]] name = "tokenizers" version = "0.12.1" description = "Fast and Customizable Tokenizers" category = "main" -optional = true +optional = false python-versions = "*" [package.extras] @@ -1088,7 +1504,7 @@ testing = ["pytest", "requests", "numpy", "datasets"] name = "tomli" version = "2.0.1" description = "A lil' TOML parser" -category = "dev" +category = "main" optional = false python-versions = ">=3.7" @@ -1097,7 +1513,7 @@ name = "torch" version = "1.12.1" description = "Tensors and Dynamic neural networks in Python with strong GPU acceleration" category = "main" -optional = true +optional = false python-versions = ">=3.7.0" [package.dependencies] @@ -1108,7 +1524,7 @@ name = "tqdm" version = "4.64.1" description = "Fast, Extensible Progress Meter" category = "main" -optional = true +optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,>=2.7" [package.dependencies] @@ -1125,7 +1541,7 @@ name = "transformers" version = "4.21.3" description = "State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow" category = "main" -optional = true +optional = false python-versions = ">=3.7.0" [package.dependencies] @@ -1186,7 +1602,7 @@ name = "typer" version = "0.4.2" description = "Typer, build great CLIs. Easy to code. Based on Python type hints." 
category = "main" -optional = true +optional = false python-versions = ">=3.6" [package.dependencies] @@ -1269,7 +1685,7 @@ name = "wasabi" version = "0.10.1" description = "A lightweight console printing and formatting toolkit" category = "main" -optional = true +optional = false python-versions = "*" [[package]] @@ -1283,11 +1699,30 @@ python-versions = ">=3.6" [package.extras] watchmedo = ["PyYAML (>=3.10)"] +[[package]] +name = "wcwidth" +version = "0.2.5" +description = "Measures the displayed width of unicode strings in a terminal" +category = "main" +optional = false +python-versions = "*" + +[[package]] +name = "wikipedia-api" +version = "0.5.4" +description = "Python Wrapper for Wikipedia" +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +requests = "*" + [[package]] name = "wrapt" version = "1.14.1" description = "Module for decorators, wrappers and monkey patching." -category = "dev" +category = "main" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7" @@ -1322,12 +1757,13 @@ whisper = ["ffmpeg-python", "tqdm"] [metadata] lock-version = "1.1" python-versions = "^3.8" -content-hash = "1adf7f2ae26bfd5204c3d7002c7a16bfa45f4cd0e8722e538ed17c65bc24fd45" +content-hash = "9eccff9b94ba2b7eb61c7cb5b8befe35cda83b30d23332128fef797765a41c17" [metadata.files] attrs = [] black = [] blis = [] +bpemb = [] cached-property = [] catalogue = [] certifi = [] @@ -1338,6 +1774,9 @@ click = [ ] colorama = [] confection = [] +conllu = [] +contourpy = [] +cycler = [] cymem = [ {file = "cymem-2.0.6-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:700540b68e96a7056d0691d467df2bbaaf0934a3e6fe2383669998cbee19580a"}, {file = "cymem-2.0.6-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a261f51796a2705f3900ed22b8442519a0f230f50a816fb5bd89cb9b027dc5ac"}, @@ -1361,10 +1800,16 @@ cymem = [ ] darglint = [] dataclasses-json = [] +deprecated = [] ffmpeg-python = [] filelock = [] +flair = [] flake8 = [] +fonttools = [] +ftfy = [] future = [] +gdown = [] +gensim = [] ghp-import = [] griffe = [] huggingface-hub = [] @@ -1378,14 +1823,20 @@ isort = [ {file = "isort-5.10.1-py3-none-any.whl", hash = "sha256:6f62d78e2f89b4500b080fe3a81690850cd254227f27f75c3a0c491a1f351ba7"}, {file = "isort-5.10.1.tar.gz", hash = "sha256:e8443a5e7a020e9d7f97f1d7d9cd17c88bcb3bc7e218bf9cf5095fe550be2951"}, ] +janome = [] jinja2 = [ {file = "Jinja2-3.1.2-py3-none-any.whl", hash = "sha256:6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61"}, {file = "Jinja2-3.1.2.tar.gz", hash = "sha256:31351a702a408a9e7595a8fc6150fc3f43bb6bf7e319770cbc0db9df9437e852"}, ] +joblib = [] +kiwisolver = [] +konoha = [] langcodes = [ {file = "langcodes-3.3.0-py3-none-any.whl", hash = "sha256:4d89fc9acb6e9c8fdef70bcdf376113a3db09b67285d9e1d534de6d8818e7e69"}, {file = "langcodes-3.3.0.tar.gz", hash = "sha256:794d07d5a28781231ac335a1561b8442f8648ca07cd518310aeb45d6f0807ef6"}, ] +langdetect = [] +lxml = [] markdown = [] markdown-callouts = [] markupsafe = [ @@ -1432,6 +1883,7 @@ markupsafe = [ ] marshmallow = [] marshmallow-enum = [] +matplotlib = [] mccabe = [] mergedeep = [] mkdocs = [] @@ -1442,6 +1894,8 @@ mkdocs-same-dir = [] mkdocstrings = [] mkdocstrings-crystal = [] mkdocstrings-python = [] +more-itertools = [] +mpld3 = [] mslex = [] multidict = [] murmurhash = [] @@ -1451,6 +1905,7 @@ mypy-extensions = [ {file = "mypy_extensions-0.4.3.tar.gz", hash = "sha256:2d82818f5bb3e369420cb3c4060a7970edba416647068eb4c5343488a6c604a8"}, ] numpy = [] +overrides = [] packaging = [ {file = 
"packaging-21.3-py3-none-any.whl", hash = "sha256:ef103e05f519cdc783ae24ea4e2e0f508a9c99b2d4969652eed6a2e1ea5bd522"}, {file = "packaging-21.3.tar.gz", hash = "sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb"}, @@ -1458,6 +1913,7 @@ packaging = [ partial-tagger = [] pathspec = [] pathy = [] +pillow = [] platformdirs = [ {file = "platformdirs-2.5.2-py3-none-any.whl", hash = "sha256:027d8e83a2d7de06bbac4e5ef7e023c02b863d7ea5d079477e722bb41ab25788"}, {file = "platformdirs-2.5.2.tar.gz", hash = "sha256:58c8abb07dcb441e6ee4b11d8df0ac856038f944ab98b7be6b27b2a3c7feef19"}, @@ -1482,6 +1938,7 @@ pyparsing = [ {file = "pyparsing-3.0.9.tar.gz", hash = "sha256:2b020ecf7d21b687f219b71ecad3631f644a47f01403fa1d1036b0c6416d70fb"}, ] pyproject-flake8 = [] +pysocks = [] pytest = [] python-dateutil = [ {file = "python-dateutil-2.8.2.tar.gz", hash = "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86"}, @@ -1494,6 +1951,12 @@ regex = [] requests = [] requests-toolbelt = [] responses = [] +scikit-learn = [] +scipy = [] +segtok = [] +sentencepiece = [] +seqal = [] +setuptools-scm = [] six = [ {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, @@ -1508,9 +1971,15 @@ spacy-legacy = [] spacy-loggers = [] spacy-partial-tagger = [] spacy-transformers = [] +sqlitedict = [] srsly = [] +tabulate = [] taskipy = [] thinc = [] +threadpoolctl = [ + {file = "threadpoolctl-3.1.0-py3-none-any.whl", hash = "sha256:8b99adda265feb6773280df41eece7b2e6561b772d21ffd52e372f999024907b"}, + {file = "threadpoolctl-3.1.0.tar.gz", hash = "sha256:a335baacfaa4400ae1f0d8e3a58d6674d2f8828e3716bb2802c44955ad391380"}, +] tokenizers = [] tomli = [ {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, @@ -1528,6 +1997,11 @@ urllib3 = [] vcrpy = [] wasabi = [] watchdog = [] +wcwidth = [ + {file = "wcwidth-0.2.5-py2.py3-none-any.whl", hash = "sha256:beb4802a9cebb9144e99086eff703a642a13d6a0052920003a230f3294bbe784"}, + {file = "wcwidth-0.2.5.tar.gz", hash = "sha256:c4d647b99872929fdb7bdcaa4fbe7f01413ed3d98077df798530e5b04f116c83"}, +] +wikipedia-api = [] wrapt = [] yarl = [] zipp = [] diff --git a/pyproject.toml b/pyproject.toml index abf1a25..5081609 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,8 @@ spacy-partial-tagger = { version = "^0.9.1", optional = true } tqdm = { version = "^4.64.1", optional = true } pydantic = "^1.9.2" requests-toolbelt = "^0.9.1" -ffmpeg-python = {version = "^0.2.0", extras = ["whisper"]} +ffmpeg-python = { version = "^0.2.0", optional = true } +seqal = { version = "^0.3.4", optional = true } [tool.poetry.dev-dependencies] flake8 = "^5.0.4" @@ -48,6 +49,7 @@ mkdocstrings = {extras = ["python", "crystal"], version = "^0.19.0"} [tool.poetry.extras] spacy = ["spacy", "spacy-partial-tagger", "tqdm"] whisper = ["ffmpeg-python", "tqdm"] +al = ["spacy", "seqal"] [build-system] requires = ["poetry-core>=1.0.0"]