
Commit 6a3d58d

Initial commit

Committed Feb 29, 2024 · 0 parents

7 files changed · +374 −0 lines

.gitignore
+161
@@ -0,0 +1,161 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
.vscode/

data/vacancies-10.xlsx (3.22 MB, binary file not shown)

data/vacancies-134.xlsx (1.68 MB, binary file not shown)

data/vacancies-156.xlsx (1.09 MB, binary file not shown)

data/vacancies-163.xlsx (970 KB, binary file not shown)

data/vacancies-164.xlsx (592 KB, binary file not shown)
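These five workbooks are the aggregator's exports, one per professional role queried in index.py below. A minimal sketch, not part of the commit, for loading one of them back into pandas for analysis (it assumes openpyxl is installed, the engine pandas uses for .xlsx files):

import pandas as pd

# the first sheet column is the unnamed index written by DataFrame.to_excel()
df = pd.read_excel('./data/vacancies-10.xlsx', index_col=0)
print(df.shape)
print(df[['name', 'salary_from', 'salary_to', 'employer_name']].head())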

index.py
+213
@@ -0,0 +1,213 @@
import requests
from urllib import parse
import time
import os
import math
import pandas as pd

REQUEST_PAUSE = 0.2  # experimentally chosen pause to stay under the requests-per-minute limit

class VacancyAggregator:
    baseUrl = 'https://api.hh.ru/vacancies/'

    def __init__(self, role=10):
        self.professional_role = role
        self.totalPages = float('inf')  # unknown until the first response arrives
        self.vacancies = {}             # vacancy id -> merged summary + details
        self.preparedVacancies = []     # flattened rows for the output table
        self.params = {
            'area': 1,  # 1 - Moscow
            'professional_role': role,
            # 10  - analyst
            # 134 - financial analyst, investment analyst
            # 156 - BI analyst, data analyst
            # 163 - marketing analyst
            # 164 - product analyst
            'per_page': 100
        }
        print('VacancyAggregator constructed')

    def getVacancy(self, url):
        """Fetch the full record of a single vacancy by its detail URL."""
        data = {}
        try:
            response = requests.get(url)
            data = response.json()
            if response.status_code == 200:
                print('ok', url)
            else:
                print('not ok', response.status_code, url)
            response.close()
        except Exception as exc:
            print(f'{url} made exception: {exc}')
        return data

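    # The per-id lookup above exists because the /vacancies search endpoint
    # returns only summaries; getVacancies() below merges each summary with
    # its detailed record so fields like description and key_skills are kept.
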
    def getVacancies(self, url):
        """Fetch one page of search results and the details of every vacancy on it."""
        print(f'getting vacancies for {self.professional_role} and params: {self.params}')
        data = {}
        try:
            response = requests.get(url)
            data = response.json()
            response.close()
        except Exception as exc:
            print(f'{url} made exception: {exc}')
        # the first successful response tells us the total page count
        if math.isinf(self.totalPages) and 'pages' in data:
            self.totalPages = data['pages']
        if 'items' not in data:
            print(url, 'failed to load data')
            return
        for item in data['items']:
            time.sleep(REQUEST_PAUSE)
            details = self.getVacancy(self.baseUrl + item['id'])
            # merge the search summary with the detailed record; detail fields win
            self.vacancies[item['id']] = {**item, **details}

    def prepareVacancy(self, vacancyId, vacancy):
        """Flatten one vacancy dict into a positional row for the output table."""
        if vacancy['salary'] is not None:
            salary_from = vacancy['salary']['from']
            salary_to = vacancy['salary']['to']
        else:
            salary_from = None
            salary_to = None
        if vacancy['address'] is not None:
            address_raw = vacancy['address']['raw']
        else:
            address_raw = None
        if vacancy.get('key_skills') is not None:
            keySkills = ','.join(skill['name'] for skill in vacancy['key_skills'])
        else:
            keySkills = ''
        description = vacancy.get('description') or ''
        self.preparedVacancies.append([
            vacancy['id'],
            vacancy['premium'],
            vacancy['name'],
            vacancy['has_test'],
            vacancy['response_letter_required'],
            vacancy['area']['id'],
            vacancy['area']['name'],
            salary_from,
            salary_to,
            vacancy['type']['name'],
            address_raw,
            vacancy['response_url'],
            vacancy['sort_point_distance'],
            vacancy['published_at'],
            vacancy['created_at'],
            vacancy['archived'],
            description,
            keySkills,
            vacancy['apply_alternate_url'],
            vacancy['insider_interview'],
            vacancy['url'],
            vacancy['alternate_url'],
            vacancy['relations'],
            vacancy['employer']['name'],
            vacancy['snippet']['requirement'],
            vacancy['snippet']['responsibility'],
            vacancy['contacts'],
            vacancy['schedule']['name'],
            vacancy['working_days'],
            vacancy['working_time_intervals'],
            vacancy['working_time_modes'],
            vacancy['accept_temporary']
        ])

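    # The positional row appended in prepareVacancy() must stay in sync with
    # the `columns` list in saveToXlsx() below: both carry the same 32 fields
    # in the same order.
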
    def saveToXlsx(self):
        """Flatten every collected vacancy and write the table to an .xlsx file."""
        os.makedirs('./data/', exist_ok=True)

        for vacancyId, vacancy in self.vacancies.items():
            self.prepareVacancy(vacancyId, vacancy)

        vacanciesDF = pd.DataFrame(self.preparedVacancies, columns=[
            'id',
            'premium',
            'name',
            'has_test',
            'response_letter_required',
            'area_id',
            'area_name',
            'salary_from',
            'salary_to',
            'type_name',
            'address_raw',
            'response_url',
            'sort_point_distance',
            'published_at',
            'created_at',
            'archived',
            'description',
            'key_skills',
            'apply_alternate_url',
            'insider_interview',
            'url',
            'alternate_url',
            'relations',
            'employer_name',
            'snippet_requirement',
            'snippet_responsibility',
            'contacts',
            'schedule_name',
            'working_days',
            'working_time_intervals',
            'working_time_modes',
            'accept_temporary'
        ])
        # write into ./data/, the directory created above and the one that
        # holds the committed workbooks
        vacanciesDF.to_excel(f'./data/vacancies-{self.professional_role}.xlsx')

    def aggregateInfo(self):
        """Fetch every result page for the configured role."""
        # the first request (page 0) also fills in self.totalPages
        self.getVacancies('?'.join([
            self.baseUrl,
            parse.urlencode({
                **self.params,
                'page': 0
            })
        ]))

        if math.isinf(self.totalPages):
            return  # the first request failed; nothing to paginate

        # pages are numbered 0..totalPages-1 and page 0 is already fetched
        urls = []
        for i in range(1, self.totalPages):
            urls.append(self.baseUrl + '?' + parse.urlencode({
                **self.params,
                'page': i
            }))
        for url in urls:
            self.getVacancies(url)


vp = VacancyAggregator()
vp.aggregateInfo()
vp.saveToXlsx()

# vp = VacancyAggregator(134)
# vp.aggregateInfo()
# vp.saveToXlsx()

# vp = VacancyAggregator(156)
# vp.aggregateInfo()
# vp.saveToXlsx()

# vp = VacancyAggregator(163)
# vp.aggregateInfo()
# vp.saveToXlsx()

# vp = VacancyAggregator(164)
# vp.aggregateInfo()
# vp.saveToXlsx()
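A possible simplification, not part of this commit: the five per-role runs above could collapse into a single loop over the role ids the class documents. A minimal sketch, assuming the same module-level names:

for role in (10, 134, 156, 163, 164):
    vp = VacancyAggregator(role)
    vp.aggregateInfo()
    vp.saveToXlsx()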
