diff --git a/.gitignore b/.gitignore index b6e4761..9660b21 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,9 @@ +## Project specific files +Реестр членов_files/ +users.json +index.html +registry.html + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/ER_diag.png b/ER_diag.png new file mode 100644 index 0000000..a523dfe Binary files /dev/null and b/ER_diag.png differ diff --git a/README.md b/README.md new file mode 100644 index 0000000..8eb57c4 --- /dev/null +++ b/README.md @@ -0,0 +1,53 @@ +# projectparser + + +## Подготовка окружения + +``` +pip install -r req.txt +``` + +## Наполнение данных + +* Запуск парсинга +``` +python parserweb/parser.py +``` +* Наполнение БД +``` +python populate_db.py +``` +* Запуск приложения +``` +./run.sh +``` +## Модель данных + +![ER](ER_diag.png) + +Link + +https://dbdesigner.page.link/wCPjHn8Gwk7iNKeEA + + + +## TODO + +[TODO](TODO.md) + +## Типовые проблемы + +### Проблемы с импортом + +если не проходит импорт, например +``` +ModuleNotFoundError: No module named 'parser_core' +``` +Необходимо проверить: +``` +echo $PYTHONPATH +``` +если переменная пустая, то в корне проекта выполнить: +``` +export PYTHONPATH=$PWD +``` \ No newline at end of file diff --git a/TODO.md b/TODO.md new file mode 100644 index 0000000..d027ae2 --- /dev/null +++ b/TODO.md @@ -0,0 +1,17 @@ +# Что нужно сделать + +## MVP +- [x] получить данные +- [x] парсинг первой страницы +- [x] парсинг данных пользователя +- [x] рефакторинг кода для 'недействующих пользователей' +- [x] рефакторинг DRY +- [x] спроектировать базу +- [x] вставить данные в базу +- [x] вывод базы данных в вебе +- [x] фильтр по полю статуса агентов +- [x] статус выгрузки (progress bar) +## Improvment +- [ ] парсинг с пагинацией +- [ ] реализовать логирование выгрузки данных +- [ ] асинхронные запросы к API diff --git a/db/__init__.py b/db/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/db/model.py b/db/model.py new file mode 100644 index 
from flask_sqlalchemy import SQLAlchemy


db = SQLAlchemy()


class Rsodata(db.Model):
    """One registry entry for an SRO member parsed from sroroo.ru.

    Long free-text fields use Text; short labels use String.  ``url`` is the
    member's registry page and doubles as a natural unique key.
    """

    id = db.Column(db.Integer, primary_key=True)
    reestr_number = db.Column(db.Text)                   # "Номер в Реестре РОО"
    satisfied = db.Column(db.String)                     # membership-conditions compliance
    excluded = db.Column(db.Text, nullable=True)         # exclusion note, or "Действующий"
    stopped = db.Column(db.Text, nullable=True)          # suspension-of-activity note
    grade = db.Column(db.String, nullable=True)          # membership grade
    lfm = db.Column(db.String, nullable=True)            # full name (last/first/middle)
    compensation = db.Column(db.String, nullable=True)   # compensation fund info
    experience = db.Column(db.String, nullable=True)     # work experience
    contacts = db.Column(db.Text, nullable=True)
    url = db.Column(db.String, unique=True, nullable=True)
    ensurance = db.Column(db.String, nullable=True)      # insurance organisation

    def __repr__(self):
        # Bug fix: the original template string was empty (''), so repr()
        # always produced '' and the four format arguments were discarded.
        return '<Rsodata {} {} {} {}>'.format(
            self.reestr_number, self.satisfied, self.excluded, self.url)
-- Schema for the parsed registry data.
--
-- Fix: SQLite allows exactly ONE primary key per table, and AUTOINCREMENT is
-- only valid on an INTEGER PRIMARY KEY column.  The original declared
-- PRIMARY KEY AUTOINCREMENT on several varchar columns (firstname, lastname,
-- middlename, ensurance_org), which is invalid SQL and fails at CREATE time.

CREATE TABLE Users (
    id integer PRIMARY KEY AUTOINCREMENT,
    firstname varchar,
    lastname varchar,
    middlename varchar,
    compensation varchar,
    experience integer,
    contacts varchar,
    ensurance_id integer,
    rso_id blob,
    url varchar
);

CREATE TABLE rsodata (
    id integer PRIMARY KEY AUTOINCREMENT,
    reestr_number integer,
    satisfied blob,
    excluded varchar,
    stopped varchar,
    grade varchar
);

CREATE TABLE ensurance (
    id integer PRIMARY KEY AUTOINCREMENT,
    ensurance_org varchar
);
def cleanup_changed(content, idx):
    """Return the cleaned text of ``content[idx]``, or None when unavailable.

    ``content`` is assumed to be a sequence of bs4 tags -- TODO confirm; the
    last newline-separated chunk of the tag text is a change marker and is
    stripped off.
    """
    # Bug fix: the original guard used ``idx > len(content)``, so the boundary
    # value idx == len(content) slipped past it and was only rescued by the
    # broad except below.  Use >= for a correct bounds check.
    if idx >= len(content):
        return None
    try:
        return ' '.join(
            content[idx].get_text(strip=True, separator='\n').split('\n')[:-1])
    except AttributeError:
        # Element without get_text() (e.g. a plain string node).
        return None


def parse_user_info(content, url):
    """Parse one member page into a flat dict of person attributes.

    :param content: raw HTML of the member page
    :param url: page URL, stored verbatim in the result
    :return: dict with name, birth info, status, url and all fields
             produced by ``get_person``
    """
    soup = BeautifulSoup(content, "html.parser")
    try:
        lfm = soup.find("h3").text  # full name
    except AttributeError:
        lfm = None
    try:
        # First <p> appears to look like "... - <date>, <city>" -- TODO confirm.
        date_and_city_of_birth = soup.find("div", class_="reestr").find('p').text.strip()
        city_of_birth = date_and_city_of_birth.split(',')[1].strip()
        date_of_birth = date_and_city_of_birth.split(',')[0].split('-')[1].strip()
    except (AttributeError, IndexError):
        city_of_birth = None
        date_of_birth = None
    try:
        status = soup.find("table", class_="top").find('td').text.strip()
    except AttributeError:
        # A missing status table is treated as an active member.
        status = "Действующий"

    expanded_dict = get_person(soup)
    return {
        'lfm': lfm,
        'birth_date': date_of_birth,
        'birth_city': city_of_birth,
        'status': status,
        'url': url,
        **expanded_dict,
    }


if __name__ == '__main__':
    # Manual smoke test against a locally saved page.
    from parserweb.parser import headers
    url = 'http://sroroo.ru/about/reestr/379972/'
    with open('index.html', encoding='utf-8') as file:
        data = file.read()
    pprint(parse_user_info(data, url))
# Human-readable (Russian) column captions for the info.html table header.
rso_data_column_dict = {
    'id': "Порядковый номер",
    'reestr_number': "Номер в реестре",
    'satisfied': "Соответствует ли",
    'excluded': "Является ли агентом",
    'stopped': "Прекращено ли членство",
    'grade': "Степень членства",
    'contacts': "Контакты",
    'organization': "Страховые компании",
    'experience': "Стаж",
    'ensurance': "Страховщик",
    'compensation': "Компенсационный фонд",
    'lfm': "ФИО",
    'url': "ссылка",
}

# Subset of columns actually shown in the web table.
displayed_col = ['lfm', 'grade', 'excluded', 'url']


def create_app():
    """Application factory: configure Flask + SQLAlchemy and register routes."""
    columns = Rsodata.__table__.columns.keys()
    app = Flask(__name__)
    app.config.from_pyfile('config.py')
    db.init_app(app)

    def _render(title, agent_list):
        # Shared rendering for both list views (was duplicated verbatim in
        # each route; the original also carried an unused ``active_agents``
        # list with a misspelled 'exluded' entry -- removed).
        return render_template('info.html', page_title=title,
                               agent_list=agent_list, columns=columns,
                               rso_data_column_dict=rso_data_column_dict,
                               displayed_col=displayed_col)

    @app.route('/')
    def index():
        # All agents, regardless of membership grade.
        return _render("Информация по агентам", Rsodata.query.all())

    @app.route('/active')
    def filtered():
        # Only current full members of the SRO.
        agents = Rsodata.query.filter(
            Rsodata.grade.in_(['Действительный член РОО'])).all()
        return _render("Информация активным по агентам", agents)

    return app
# Browser-like UA string: the registry site rejects default python-requests UA.
headers = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.174 YaBrowser/22.1.3.856 (beta) Yowser/2.5 Safari/537.36"
}


def get_data_from_file(filename):
    """Return the contents of a previously saved HTML snapshot."""
    # TODO: catch errors
    with open(filename, encoding='utf-8') as file:
        return file.read()


def get_data_from_url(url):
    """Fetch ``url`` and return the response body as text.

    :raises Exception: on network failure or a non-200 status code.
    """
    try:
        response = requests.get(url, headers=headers)
    except Exception as e:
        # Bug fix: the original printed a truncated message and fell through,
        # then crashed with NameError because ``response`` was never bound.
        # Re-raise so the caller sees the real network error.
        print(f"Something went wrong while requesting {url}: {e}")
        raise
    if response.status_code != 200:
        raise Exception("Status code should be 200")
    return response.text


def get_data(source_type):
    """Return registry-listing HTML from 'url' (live site) or 'file' (snapshot).

    :raises ValueError: for any other ``source_type``.
    """
    if source_type == 'url':
        url = f'{BASE_URL}/about/reestr/?member_lname=&organization='
        return get_data_from_url(url)
    if source_type == 'file':
        return get_data_from_file('registry.html')
    # Bug fix: the original called sys.exit(1) on a bad argument, killing the
    # interpreter from inside library code; raise a normal exception instead.
    raise ValueError(f"Unknown source_type: {source_type!r}")


def parse_main_page():
    """Walk the registry listing, fetch every member page, return parsed dicts.

    Rows whose member page yields no name are collected and reported at the
    end instead of being silently dropped.
    """
    content = get_data('url')
    soup = BeautifulSoup(content, "html.parser")
    all_rows = soup.find_all('td', class_="left-td")
    person_list = []
    # Bug fix: error_list was (re)initialised INSIDE the loop, so only
    # failures from the very last row could survive to the report below.
    error_list = []
    bar = IncrementalBar('Getting user info', max=len(all_rows))
    for row in all_rows:
        href_obj = row.find('a')
        url = BASE_URL + href_obj['href']
        bar.next()
        user_data = get_user_info(url, headers=headers)
        user_parsed_data = parse_user_info(user_data, url)
        if not user_parsed_data['lfm']:
            error_list.append((href_obj.text, url))
            continue
        person_list.append(user_parsed_data)
    bar.finish()
    for name, bad_url in error_list:
        print(f"Can't get user info for {name} from url {bad_url}")
    return person_list


def return_parsed_data():
    """Load previously scraped members from users.json."""
    with open('users.json', 'r', encoding='utf-8') as file:
        return json.load(file)


def save_data(row):
    """Insert one parsed member dict into the Rsodata table and commit.

    Must run inside an application context (db.session is app-bound).
    """
    rso = Rsodata(
        reestr_number=row.get('reestr_number'),
        satisfied=row.get('satisfied'),
        excluded=row.get('excluded'),
        stopped=row.get('stopped'),
        grade=row.get('grade'),
        ensurance=row.get('ensurance'),
        lfm=row.get('lfm'),
        compensation=row.get('compensation'),
        experience=row.get('experience'),
        contacts=row.get('contacts'),
        url=row.get('url'),
    )
    db.session.add(rso)
    db.session.commit()


if __name__ == '__main__':
    result = parse_main_page()
    with open('users.json', 'w', encoding='utf-8') as file:
        json.dump(result, indent=4, fp=file, ensure_ascii=False)
{{ rso_data_column_dict[column] }}
{{ agent[column] }}
from db.model import db
from parserweb import create_app
from parserweb.parser import return_parsed_data, save_data

# Bug fix: the original built THREE separate app instances --
# db.drop_all(app=create_app()), db.create_all(app=create_app()), and a third
# for the insert loop.  The ``app=`` argument to drop_all/create_all is also
# the legacy Flask-SQLAlchemy 2.x API (removed in 3.x).  Build one app and do
# the drop / create / populate sequence inside a single application context.
app = create_app()
with app.app_context():
    db.drop_all()
    db.create_all()
    for row in return_parsed_data():
        save_data(row)