-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathensemble.py
125 lines (79 loc) · 4.79 KB
/
ensemble.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
""" Ensembles context-free and context-dependent LSCD predictions made with predict.py. """
# only used for visualizations
from scipy.stats import spearmanr
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import argparse
import os
# command line interface: two positional experiment folders and an optional plotting switch
parser = argparse.ArgumentParser(
    description="Ensembles context-free and context-dependent LSCD predictions made with predict.py.",
)
parser.add_argument(
    "context_free_dir",
    help="Context-free experiment folder",
    type=str,
)
parser.add_argument(
    "context_dependent_dir",
    help="Context-dependent experiment folder",
    type=str,
)
parser.add_argument(
    "--plot_all",
    action="store_true",
    dest="plot_all",
    help="Plots and evaluates all possible weights",
)
def ensemble(dataset_dir="", context_free_dir="", context_dependent_dir="", plot_all=False):
    """ Ensembles a context-free and a context-dependent lexical semantic change ranking.

    Both experiment folders must be named "<model>_<dataset>", where <model> is
    "context-free" and "context-dependent" respectively and <dataset> is identical.
    Each folder must contain a headerless, tab-separated prediction.tsv with the
    columns word and change; the context-dependent folder must additionally contain
    bert/classification_accuracy.npy. The ensembled ranking is written to
    experiments/ensemble-circe_<dataset>/prediction.tsv, relative to the current
    working directory.

    Args:
        dataset_dir: Folder containing truth.tsv; only read when plot_all is set.
            When empty, datasets/<dataset>/ is used.
        context_free_dir: Folder of the context-free experiment.
        context_dependent_dir: Folder of the context-dependent experiment.
        plot_all: If set, evaluates 51 evenly spaced weights in [0, 1] against
            truth.tsv and stores the plot as all_weights.png in the experiment folder.

    Raises:
        ValueError: If an experiment folder name contains no underscore.
        AssertionError: If the folders are labeled wrongly, belong to different
            datasets, list different target words, or truth.tsv is missing.
    """
    # organisational data and directory checks
    if not context_free_dir.endswith("/"):
        context_free_dir += "/"
    if not context_dependent_dir.endswith("/"):
        context_dependent_dir += "/"
    # split on the first underscore only: the expected model names contain none,
    # whereas a dataset name may well contain underscores
    cf_model_name, cf_dataset_name = context_free_dir.split("/")[-2].split("_", 1)
    cd_model_name, cd_dataset_name = context_dependent_dir.split("/")[-2].split("_", 1)
    assert cf_model_name == "context-free" and cd_model_name == "context-dependent", "Experiments labeld wrongly, check argument order!"
    assert cf_dataset_name == cd_dataset_name, "Not experiments on the same dataset!"
    # honour an explicitly passed dataset folder, otherwise derive it from the dataset name
    if dataset_dir:
        if not dataset_dir.endswith("/"):
            dataset_dir += "/"
    else:
        dataset_dir = "datasets/{}/".format(cf_dataset_name)
    experiment_dir = "experiments/ensemble-circe_{}/".format(cf_dataset_name)
    # makedirs also creates the parent "experiments" folder when it is missing
    os.makedirs(experiment_dir, exist_ok=True)
    print("Experiment data will be stored in {} ...".format(experiment_dir))

    # ensemble experiment execution
    # both tables are sorted by word so that their rows line up position by position
    cf_pred_df = pd.read_csv(context_free_dir + "prediction.tsv", sep="\t", names=["word", "change"], header=None).sort_values("word")
    cd_pred_df = pd.read_csv(context_dependent_dir + "prediction.tsv", sep="\t", names=["word", "change"], header=None).sort_values("word")
    assert np.all(cf_pred_df.word.values == cd_pred_df.word.values), "Predictions do not correspond to the same target words!"
    # argsort of argsort turns the scores into 0-based ranks (smallest score gets rank 0)
    rank_cf = np.argsort(np.argsort(cf_pred_df.change.values))
    rank_cd = np.argsort(np.argsort(cd_pred_df.change.values))
    acc_bert = np.load(context_dependent_dir + "bert/classification_accuracy.npy")
    # an accuracy of 0.5 yields weight 0 (context-free only), an accuracy of 1.0 yields weight 1
    w_circe = 2 * acc_bert - 1
    r_circe = w_circe * rank_cd + (1 - w_circe) * rank_cf
    rank_circe = np.argsort(np.argsort(r_circe))
    circe_pred_df = pd.DataFrame({"word": cf_pred_df.word.values, "change": rank_circe})
    circe_pred_df.to_csv(experiment_dir + "prediction.tsv", sep="\t", index=False, header=False)
    print("Finished ensembling. Prediction can be found in {}.".format(experiment_dir + "prediction.tsv"))

    # visualization
    if plot_all:
        print("Plotting all possible weights ...")
        truth_fp = dataset_dir + "truth.tsv"
        assert os.path.exists(truth_fp), "No truth.tsv found in {}!".format(dataset_dir)
        y_df = pd.read_csv(truth_fp, sep="\t", names=["word", "change"], header=None).sort_values("word")
        assert np.all(y_df.word.values == cd_pred_df.word.values), "Predictions do not correspond to truth target words!"
        y = y_df.change.values
        # value space of w_circe = 2.0 * acc_bert - 1.0 contains 51 weights since acc_bert is rounded to 2 decimals
        weights = np.linspace(0.0, 1.0, 51)
        correlations = []
        for w in weights:
            r_pred = w * rank_cd + (1 - w) * rank_cf
            rank_pred = np.argsort(np.argsort(r_pred))
            corr, _ = spearmanr(rank_pred, y)
            correlations.append(corr)
        visualize_weights(weights, correlations, w_circe, experiment_dir.split("/")[-2], experiment_dir + "all_weights.png")
def visualize_weights(weights, correlations, w_circe, experiment_name, export_fp):
    """ Plots the Spearman correlation of every candidate CIRCE weight and saves the figure.

    The curve shows correlations over weights, a dashed horizontal line marks the
    better of the two end points (first and last correlation), and a dot marks the
    correlation at the grid weight closest to w_circe. The figure is written to
    export_fp and then shown.
    """
    tick_positions = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]
    tick_labels = ["Context-Free", 0.2, 0.4, 0.6, 0.8, "Context-Dependent"]

    # performance curve over the whole weight grid
    plt.plot(weights, correlations, color="b", label="CIRCE Performance")

    # the first and last grid points use a single component only (weight 0 and weight 1)
    best_component = max(correlations[0], correlations[-1])
    plt.axhline(y=best_component, color="r", linestyle="--", label="Best Component Performance")

    # highlight the grid point that lies closest to the weight actually used
    nearest_idx = np.argmin(np.abs(weights - w_circe))
    plt.scatter(w_circe, correlations[nearest_idx], c="g", s=50, label="Submission Weight")

    plt.legend()
    plt.xlim(0.0, 1.0)
    plt.ylim(0.0, 1.0)
    plt.xticks(tick_positions, labels=tick_labels)
    plt.ylabel("Spearman Correlation")
    plt.xlabel("CIRCE Weight")
    plt.suptitle(experiment_name)

    # leave room for the title above the tightened layout
    plt.tight_layout()
    plt.subplots_adjust(top=0.90)

    plt.savefig(export_fp)
    plt.show()
if __name__ == "__main__":
    # forward the parsed command line options to ensemble() as keyword arguments
    cli_options = vars(parser.parse_args())
    ensemble(**cli_options)