|
1 | 1 | import uuid
|
2 |
| -from common.names import ( |
3 |
| - CHUNK_SIZES, |
4 |
| - DATASET_NAMES, |
5 |
| - DISTANCES, |
6 |
| - MODEL_NAMES, |
7 |
| - SEMANTIC_TYPES, |
8 |
| -) |
| 2 | +from common.names import CHUNK_SIZES, DATASET_NAMES, DISTANCES, INDEX_NAMES, MODEL_NAMES |
9 | 3 | from common.passage import Passage
|
10 | 4 | import hashlib
|
11 | 5 |
|
@@ -50,25 +44,42 @@ def get_reranker_hash(model: str, query: str, passage_ids: list, count: int):
|
50 | 44 | return "reranker:" + hashed
|
51 | 45 |
|
52 | 46 |
|
53 |
| -def get_all_qdrant_collection_names(): |
54 |
| - names = [] |
55 |
| - for dataset_name in DATASET_NAMES: |
56 |
| - for model_name in MODEL_NAMES: |
57 |
| - for distance in DISTANCES: |
58 |
| - for chunk_size, _ in CHUNK_SIZES: |
59 |
| - name = get_qdrant_collection_name( |
60 |
| - dataset_name, model_name, "character", chunk_size, distance |
61 |
| - ) |
62 |
| - names.append(name) |
63 |
| - |
64 |
| - for semantic_type in SEMANTIC_TYPES: |
65 |
| - name = get_qdrant_collection_name( |
66 |
| - dataset_name, model_name, semantic_type, 1.5, distance |
67 |
| - ) |
68 |
| - names.append(name) |
69 |
| - |
70 |
| - return names |
def get_relevant_document_count_hash(id: str, dataset_key: str):
    """Build the ``"count:"``-prefixed key for an id / dataset-key pair.

    ``id`` and ``dataset_key`` are concatenated with no separator, encoded
    with ``str.encode()`` and digested with SHA-256. The hexadecimal digest
    is appended to the literal prefix ``"count:"``.

    NOTE(review): because nothing separates the two parts, ("ab", "c") and
    ("a", "bc") produce the same key - confirm callers can never hit that.
    """
    digest = hashlib.sha256()
    digest.update((id + dataset_key).encode())
    return "count:" + digest.hexdigest()
71 | 50 |
|
72 | 51 |
|
def get_dataset_key(dataset_name: str, split: str):
    """Combine a dataset name and a split into a single key string.

    The two values are formatted as ``"<dataset_name>-<split>"`` and the
    result is passed through ``replace_slash_with_dash`` (defined elsewhere;
    presumably it swaps "/" for "-" - verify against its definition).
    """
    raw_key = f"{dataset_name}-{split}"
    return replace_slash_with_dash(raw_key)
|
| 54 | + |
| 55 | + |
def get_all_es_index_combinations():
    """Return every ``(index, dataset_key)`` pair as a list of tuples.

    Dataset keys are built with ``get_dataset_key`` for each name in
    ``DATASET_NAMES`` combined with the first element of each
    ``CHUNK_SIZES`` entry. Each name in ``INDEX_NAMES`` is then paired with
    every dataset key; the index varies slowest in the returned list.
    """
    # NOTE(review): the "split" passed to get_dataset_key is taken from
    # CHUNK_SIZES - confirm that constant really carries split names.
    keys = []
    for dataset_name in DATASET_NAMES:
        for split, _ in CHUNK_SIZES:
            keys.append(get_dataset_key(dataset_name, split))

    combinations = []
    for index in INDEX_NAMES:
        for dataset_key in keys:
            combinations.append((index, dataset_key))
    return combinations
| 66 | + |
| 67 | + |
def get_all_qdrant_model_combinations():
    """Return every ``(collection_name, dataset_key)`` pair as a list of tuples.

    Dataset keys are built with ``get_dataset_key`` for each name in
    ``DATASET_NAMES`` combined with the first element of each
    ``CHUNK_SIZES`` entry. Collection names come from
    ``get_qdrant_collection_name(model, distance)`` for each model in
    ``MODEL_NAMES`` and each distance in ``DISTANCES`` (model varies
    slowest). Each collection name is then paired with every dataset key;
    the collection name varies slowest in the returned list.
    """
    # NOTE(review): the "split" passed to get_dataset_key is taken from
    # CHUNK_SIZES - confirm that constant really carries split names.
    keys = []
    for dataset_name in DATASET_NAMES:
        for split, _ in CHUNK_SIZES:
            keys.append(get_dataset_key(dataset_name, split))

    collections = []
    for model in MODEL_NAMES:
        for distance in DISTANCES:
            collections.append(get_qdrant_collection_name(model, distance))

    combinations = []
    for collection_name in collections:
        for dataset_key in keys:
            combinations.append((collection_name, dataset_key))
    return combinations
0 commit comments