import requests
from urllib import parse
import time
import os
import math
import pandas as pd

REQUEST_PAUSE = 0.2  # experimental pause so we stay under the API's requests-per-minute limit

class VacancyAggregator:
    baseUrl = 'https://api.hh.ru/vacancies/'

    def __init__(self, role=10):
        self.professional_role = role
        self.totalPages = float('inf')  # unknown until the first response arrives
        self.params = {
            'area': 1,  # 1 - Moscow
            'professional_role': role,
            # 10  - analyst
            # 134 - financial analyst, investment analyst
            # 156 - BI analyst, data analyst
            # 163 - marketing analyst
            # 164 - product analyst
            'per_page': 100  # 100 is the maximum page size the API accepts
        }
        # containers are created per instance: defining them as mutable class
        # attributes would share one dict/list between all instances
        self.vacancies = {}
        self.preparedVacancies = []
        print('VacancyAggregator constructed')

    def getVacancy(self, url):
        # fetch the full details of a single vacancy by its direct url
        data = {}
        try:
            response = requests.get(url)
            data = response.json()
            if response.status_code == 200:
                print('ok', url)
            else:
                print('not ok', response.status_code, url)
            response.close()
        except Exception as exc:
            print(f'{url} made exception: {exc}')
        return data

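    # a minimal sketch of a retrying variant, assuming up to three attempts
    # with the same REQUEST_PAUSE between them (hypothetical helper, not used
    # by the class as written):
    # def getVacancyWithRetry(self, url, attempts=3):
    #     for _ in range(attempts):
    #         data = self.getVacancy(url)
    #         if data:
    #             return data
    #         time.sleep(REQUEST_PAUSE)
    #     return {}
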
    def getVacancies(self, url):
        # fetch one page of search results, then request full details
        # for every vacancy found on that page
        print(f'getting vacancies for {self.professional_role} and params: {self.params}')
        data = {}
        try:
            response = requests.get(url)
            data = response.json()
            response.close()
        except Exception as exc:
            print(f'{url} made exception: {exc}')
            return
        if 'items' not in data:
            print(url, 'failed to load data')
            return
        if math.isinf(self.totalPages):
            # the first successful response tells us how many pages exist
            self.totalPages = data['pages']
        for item in data['items']:
            time.sleep(REQUEST_PAUSE)
            details = self.getVacancy(self.baseUrl + item['id'])
            newVacancy = {**item, **details}
            self.vacancies[item['id']] = newVacancy

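    # note: the hh.ru search API is commonly documented to cap result depth
    # at about 2000 items per query (per_page * page), so very broad filters
    # may not return every matching vacancy; worth verifying against the
    # current API docs
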
    def prepareVacancy(self, vacancyId, vacancy):
        # flatten one vacancy dict into a row for the resulting table;
        # optional nested fields default to None / empty string
        if vacancy['salary'] is not None:
            salary_from = vacancy['salary']['from']
            salary_to = vacancy['salary']['to']
        else:
            salary_from = None
            salary_to = None
        if vacancy['address'] is not None:
            address_raw = vacancy['address']['raw']
        else:
            address_raw = None
        if 'key_skills' in vacancy and vacancy['key_skills'] is not None:
            keySkills = ','.join(skill['name'] for skill in vacancy['key_skills'])
        else:
            keySkills = ''
        if 'description' in vacancy and vacancy['description'] is not None:
            description = vacancy['description']
        else:
            description = ''
        self.preparedVacancies.append([
            vacancy['id'],
            vacancy['premium'],
            vacancy['name'],
            vacancy['has_test'],
            vacancy['response_letter_required'],
            vacancy['area']['id'],
            vacancy['area']['name'],
            salary_from,
            salary_to,
            vacancy['type']['name'],
            address_raw,
            vacancy['response_url'],
            vacancy['sort_point_distance'],
            vacancy['published_at'],
            vacancy['created_at'],
            vacancy['archived'],
            description,
            keySkills,
            vacancy['apply_alternate_url'],
            vacancy['insider_interview'],
            vacancy['url'],
            vacancy['alternate_url'],
            vacancy['relations'],
            vacancy['employer']['name'],
            vacancy['snippet']['requirement'],
            vacancy['snippet']['responsibility'],
            vacancy['contacts'],
            vacancy['schedule']['name'],
            vacancy['working_days'],
            vacancy['working_time_intervals'],
            vacancy['working_time_modes'],
            vacancy['accept_temporary']
        ])

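    # the optional-field handling above could also be written with a small
    # dict-descending helper; a sketch, with _nested being a hypothetical
    # name that is not part of the original code:
    # def _nested(d, *keys):
    #     for k in keys:
    #         d = d.get(k) if isinstance(d, dict) else None
    #     return d
    # salary_from = _nested(vacancy, 'salary', 'from')
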
    def saveToXlsx(self):
        # the output directory must match the path used in to_excel below
        os.makedirs('./vacancies/', exist_ok=True)

        for vacancyId, vacancy in self.vacancies.items():
            self.prepareVacancy(vacancyId, vacancy)

        vacanciesDF = pd.DataFrame(self.preparedVacancies,
                                   columns=[
                                       'id',
                                       'premium',
                                       'name',
                                       'has_test',
                                       'response_letter_required',
                                       'area_id',
                                       'area_name',
                                       'salary_from',
                                       'salary_to',
                                       'type_name',
                                       'address_raw',
                                       'response_url',
                                       'sort_point_distance',
                                       'published_at',
                                       'created_at',
                                       'archived',
                                       'description',
                                       'key_skills',
                                       'apply_alternate_url',
                                       'insider_interview',
                                       'url',
                                       'alternate_url',
                                       'relations',
                                       'employer_name',
                                       'snippet_requirement',
                                       'snippet_responsibility',
                                       'contacts',
                                       'schedule_name',
                                       'working_days',
                                       'working_time_intervals',
                                       'working_time_modes',
                                       'accept_temporary'
                                   ])
        vacanciesDF.to_excel(f'./vacancies/vacancies-{self.professional_role}.xlsx')

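    # note: DataFrame.to_excel writes .xlsx through an Excel engine such as
    # openpyxl, which must be installed separately (pip install openpyxl);
    # passing index=False would drop the extra row-number column if it is
    # not wanted
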
    def aggregateInfo(self):
        # the first request (page 0) also discovers totalPages
        self.getVacancies('?'.join([
            self.baseUrl,
            parse.urlencode({
                **self.params,
                'page': 0
            })
        ]))
        if math.isinf(self.totalPages):
            print('could not discover the number of pages, aborting')
            return

        # pages are 0-indexed and page 0 is already fetched, so the
        # remaining pages are 1 .. totalPages - 1
        urls = []
        for i in range(1, self.totalPages):
            urls.append(self.baseUrl + '?' + parse.urlencode({
                **self.params,
                'page': i
            }))
        for url in urls:
            self.getVacancies(url)


vp = VacancyAggregator()
vp.aggregateInfo()
vp.saveToXlsx()

# vp = VacancyAggregator(134)
# vp.aggregateInfo()
# vp.saveToXlsx()

# vp = VacancyAggregator(156)
# vp.aggregateInfo()
# vp.saveToXlsx()

# vp = VacancyAggregator(163)
# vp.aggregateInfo()
# vp.saveToXlsx()

# vp = VacancyAggregator(164)
# vp.aggregateInfo()
# vp.saveToXlsx()
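
# a minimal sketch for collecting every role listed above in one run,
# assuming the role ids from __init__'s comments are still current on hh.ru:
# for role in (10, 134, 156, 163, 164):
#     vp = VacancyAggregator(role)
#     vp.aggregateInfo()
#     vp.saveToXlsx()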