-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_processing.py
89 lines (65 loc) · 2.58 KB
/
data_processing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
def extract_post_hoc_data(data):
    """
    Extract systems with only O, H, and C1 category adsorbates.

    A system is kept when every atom whose tag equals 2 (the atoms this
    module treats as the adsorbate) is H, C, or O, and those adsorbate atoms
    contain at most one carbon. Carbon atoms carrying any other tag do not
    count toward the one-carbon limit, so a carbon-containing slab does not
    disqualify an otherwise valid adsorbate. A system with no atom tagged 2
    passes the filter.

    Parameters:
        data (list): List of system objects. Each object must expose
            ``tags`` and ``atomic_numbers`` attributes that support
            ``.detach().numpy()`` and have one entry per atom.

    Returns:
        list: List of filtered system objects with only O, H, and C1
            adsorbates, in their input order.
    """
    allowed_numbers = {1, 6, 8}  # H, C, O
    req_data = []
    for system in data:
        tags = system.tags.detach().numpy()
        atomic_numbers = system.atomic_numbers.detach().numpy()

        # Restrict every check to the adsorbate atoms; plain Python numbers
        # make the set-membership and count comparisons type-agnostic.
        adsorbate_numbers = atomic_numbers[tags == 2].tolist()

        if any(number not in allowed_numbers for number in adsorbate_numbers):
            continue
        if adsorbate_numbers.count(6) <= 1:
            req_data.append(system)
    return req_data
def extract_sr_data(data):
    """
    Select the systems whose adsorbate is made of hydrogen only.

    The adsorbate is taken to be the atoms whose tag equals 2. A system is
    kept when none of those atoms has an atomic number other than 1, which
    also keeps systems that have no atom tagged 2 at all.

    Parameters:
        data (list): List of system objects exposing ``tags`` and
            ``atomic_numbers`` attributes that support ``.detach().numpy()``.

    Returns:
        list: The systems with only H adsorbates, in their input order.
    """

    def adsorbate_is_hydrogen_only(entry):
        # Positions of the atoms tagged as adsorbate.
        tag_values = entry.tags.detach().numpy()
        numbers = entry.atomic_numbers.detach().numpy()
        adsorbate_positions = [
            position for position, tag in enumerate(tag_values) if tag == 2
        ]
        # Reject as soon as one adsorbate atom is not hydrogen.
        return not any(number != 1 for number in numbers[adsorbate_positions])

    return [entry for entry in data if adsorbate_is_hydrogen_only(entry)]
def extract_last_frame(req_data):
    """
    Extract the last frame id (fid) from each system id (sid).

    The input is scanned once, keeping the largest ``fid`` seen for every
    ``sid``. Because of the single pass, any iterable is accepted, including
    one-shot iterators and generators.

    Parameters:
        req_data (iterable): Filtered system objects, each exposing ``sid``
            and ``fid`` attributes. ``sid`` must be hashable and ``fid``
            values must be mutually comparable.

    Returns:
        dict: Dictionary mapping system ID to the last (maximum) frame ID.
            Empty when ``req_data`` is empty.
    """
    last_frames = {}
    for item in req_data:
        sid, fid = item.sid, item.fid
        # Keep the running maximum per system instead of rescanning the
        # whole input once for every distinct sid.
        if sid not in last_frames or fid > last_frames[sid]:
            last_frames[sid] = fid
    return last_frames
def get_material_data(mapping_path, api_key):
    """
    Fetch summary and electronic structure data from the Materials Project API.

    Parameters:
        mapping_path (str): Path to the pickle file containing the system ID
            to Materials Project ID mapping. The unpickled object must be a
            dict whose values are mappings with a 'bulk_mpid' entry.
        api_key (str): Your Materials Project API key.

    Returns:
        tuple: ``(summary, electronic, mp_sid_dict)`` where ``summary`` and
            ``electronic`` are the results of the summary and electronic
            structure searches, and ``mp_sid_dict`` maps each system ID to
            its 'bulk_mpid' value.
    """
    # Imported here so the function does not rely on a module-level import.
    import pickle

    # NOTE(security): unpickling can execute arbitrary code. Only pass
    # mapping files that come from a trusted source.
    with open(mapping_path, 'rb') as file:
        mapping = pickle.load(file)

    # Extract SID and corresponding mp-ids
    mp_sid_dict = {sid: info['bulk_mpid'] for sid, info in mapping.items()}

    # Several system IDs may point at the same bulk material; request each
    # material ID only once, keeping first-seen order.
    mpid_list = list(dict.fromkeys(mp_sid_dict.values()))

    with MPRester(api_key=api_key) as mpr:
        summary = mpr.materials.summary.search(material_ids=mpid_list)
        electronic = mpr.materials.electronic_structure.search(material_ids=mpid_list)
    return summary, electronic, mp_sid_dict