
Commit 6a3d58d

Initial commit

Committed Feb 29, 2024 · 0 parents

7 files changed · +374 −0 lines

.gitignore
+161
@@ -0,0 +1,161 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
.vscode/

data/vacancies-10.xlsx (3.22 MB, binary file not shown)

data/vacancies-134.xlsx (1.68 MB, binary file not shown)

data/vacancies-156.xlsx (1.09 MB, binary file not shown)

data/vacancies-163.xlsx (970 KB, binary file not shown)

data/vacancies-164.xlsx (592 KB, binary file not shown)
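These five workbooks are the aggregator's exports, one per professional role queried in index.py below. A minimal sketch, not part of the commit, for loading one of them back into pandas for analysis (it assumes openpyxl is installed, the engine pandas uses for .xlsx files):

import pandas as pd

# the first sheet column is the unnamed index written by DataFrame.to_excel()
df = pd.read_excel('./data/vacancies-10.xlsx', index_col=0)
print(df.shape)
print(df[['name', 'salary_from', 'salary_to', 'employer_name']].head())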

index.py
+213
@@ -0,0 +1,213 @@
import requests
from urllib import parse
import time
import os
import math
import pandas as pd

REQUEST_PAUSE = 0.2  # experimentally chosen pause to stay under the requests-per-minute limit

class VacancyAggregator:
    baseUrl = 'https://api.hh.ru/vacancies/'

    def __init__(self, role=10):
        self.professional_role = role
        self.totalPages = float('inf')  # unknown until the first response arrives
        self.vacancies = {}             # vacancy id -> merged summary + details
        self.preparedVacancies = []     # flattened rows for the output table
        self.params = {
            'area': 1,  # 1 - Moscow
            'professional_role': role,
            # 10  - analyst
            # 134 - financial analyst, investment analyst
            # 156 - BI analyst, data analyst
            # 163 - marketing analyst
            # 164 - product analyst
            'per_page': 100
        }
        print('VacancyAggregator constructed')

    def getVacancy(self, url):
        """Fetch the full record of a single vacancy by its detail URL."""
        data = {}
        try:
            response = requests.get(url)
            data = response.json()
            if response.status_code == 200:
                print('ok', url)
            else:
                print('not ok', response.status_code, url)
            response.close()
        except Exception as exc:
            print(f'{url} made exception: {exc}')
        return data

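    # The per-id lookup above exists because the /vacancies search endpoint
    # returns only summaries; getVacancies() below merges each summary with
    # its detailed record so fields like description and key_skills are kept.
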
    def getVacancies(self, url):
        """Fetch one page of search results and the details of every vacancy on it."""
        print(f'getting vacancies for {self.professional_role} and params: {self.params}')
        data = {}
        try:
            response = requests.get(url)
            data = response.json()
            response.close()
        except Exception as exc:
            print(f'{url} made exception: {exc}')
        # the first successful response tells us the total page count
        if math.isinf(self.totalPages) and 'pages' in data:
            self.totalPages = data['pages']
        if 'items' not in data:
            print(url, 'failed to load data')
            return
        for item in data['items']:
            time.sleep(REQUEST_PAUSE)
            details = self.getVacancy(self.baseUrl + item['id'])
            # merge the search summary with the detailed record; detail fields win
            self.vacancies[item['id']] = {**item, **details}

    def prepareVacancy(self, vacancyId, vacancy):
        """Flatten one vacancy dict into a positional row for the output table."""
        if vacancy['salary'] is not None:
            salary_from = vacancy['salary']['from']
            salary_to = vacancy['salary']['to']
        else:
            salary_from = None
            salary_to = None
        if vacancy['address'] is not None:
            address_raw = vacancy['address']['raw']
        else:
            address_raw = None
        if vacancy.get('key_skills') is not None:
            keySkills = ','.join(skill['name'] for skill in vacancy['key_skills'])
        else:
            keySkills = ''
        description = vacancy.get('description') or ''
        self.preparedVacancies.append([
            vacancy['id'],
            vacancy['premium'],
            vacancy['name'],
            vacancy['has_test'],
            vacancy['response_letter_required'],
            vacancy['area']['id'],
            vacancy['area']['name'],
            salary_from,
            salary_to,
            vacancy['type']['name'],
            address_raw,
            vacancy['response_url'],
            vacancy['sort_point_distance'],
            vacancy['published_at'],
            vacancy['created_at'],
            vacancy['archived'],
            description,
            keySkills,
            vacancy['apply_alternate_url'],
            vacancy['insider_interview'],
            vacancy['url'],
            vacancy['alternate_url'],
            vacancy['relations'],
            vacancy['employer']['name'],
            vacancy['snippet']['requirement'],
            vacancy['snippet']['responsibility'],
            vacancy['contacts'],
            vacancy['schedule']['name'],
            vacancy['working_days'],
            vacancy['working_time_intervals'],
            vacancy['working_time_modes'],
            vacancy['accept_temporary']
        ])

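    # The positional row appended in prepareVacancy() must stay in sync with
    # the `columns` list in saveToXlsx() below: both carry the same 32 fields
    # in the same order.
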
    def saveToXlsx(self):
        """Flatten every collected vacancy and write the table to an .xlsx file."""
        os.makedirs('./data/', exist_ok=True)

        for vacancyId, vacancy in self.vacancies.items():
            self.prepareVacancy(vacancyId, vacancy)

        vacanciesDF = pd.DataFrame(self.preparedVacancies, columns=[
            'id',
            'premium',
            'name',
            'has_test',
            'response_letter_required',
            'area_id',
            'area_name',
            'salary_from',
            'salary_to',
            'type_name',
            'address_raw',
            'response_url',
            'sort_point_distance',
            'published_at',
            'created_at',
            'archived',
            'description',
            'key_skills',
            'apply_alternate_url',
            'insider_interview',
            'url',
            'alternate_url',
            'relations',
            'employer_name',
            'snippet_requirement',
            'snippet_responsibility',
            'contacts',
            'schedule_name',
            'working_days',
            'working_time_intervals',
            'working_time_modes',
            'accept_temporary'
        ])
        # write into ./data/, the directory created above and the one that
        # holds the committed workbooks
        vacanciesDF.to_excel(f'./data/vacancies-{self.professional_role}.xlsx')

    def aggregateInfo(self):
        """Fetch every result page for the configured role."""
        # the first request (page 0) also fills in self.totalPages
        self.getVacancies('?'.join([
            self.baseUrl,
            parse.urlencode({
                **self.params,
                'page': 0
            })
        ]))

        if math.isinf(self.totalPages):
            return  # the first request failed; nothing to paginate

        # pages are numbered 0..totalPages-1 and page 0 is already fetched
        urls = []
        for i in range(1, self.totalPages):
            urls.append(self.baseUrl + '?' + parse.urlencode({
                **self.params,
                'page': i
            }))
        for url in urls:
            self.getVacancies(url)


vp = VacancyAggregator()
vp.aggregateInfo()
vp.saveToXlsx()

# vp = VacancyAggregator(134)
# vp.aggregateInfo()
# vp.saveToXlsx()

# vp = VacancyAggregator(156)
# vp.aggregateInfo()
# vp.saveToXlsx()

# vp = VacancyAggregator(163)
# vp.aggregateInfo()
# vp.saveToXlsx()

# vp = VacancyAggregator(164)
# vp.aggregateInfo()
# vp.saveToXlsx()
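A possible simplification, not part of this commit: the five per-role runs above could collapse into a single loop over the role ids the class documents. A minimal sketch, assuming the same module-level names:

for role in (10, 134, 156, 163, 164):
    vp = VacancyAggregator(role)
    vp.aggregateInfo()
    vp.saveToXlsx()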
