diff --git a/citypopsortedbyratingcsv.ipynb b/citypopsortedbyratingcsv.ipynb new file mode 100644 index 0000000..18842b9 --- /dev/null +++ b/citypopsortedbyratingcsv.ipynb @@ -0,0 +1,463 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from rymscraper import rymscraper,RymUrl\n", + "from typing import List, Dict\n", + "import logging\n", + "import time\n", + "from bs4 import BeautifulSoup\n", + "from selenium import webdriver\n", + "from selenium.webdriver.firefox.options import Options\n", + "from selenium.webdriver.common.by import By\n", + "import logging\n", + "import re\n", + "import time\n", + "import difflib\n", + "from tqdm import tqdm\n", + "from bs4 import BeautifulSoup, NavigableString, element\n", + "from selenium.webdriver.common.by import By\n", + "from typing import List\n", + "from rapidfuzz import fuzz, process" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "\n", + "logger = logging.getLogger(__name__)\n", + "\n", + "\n", + "class RymBrowser(webdriver.Firefox):\n", + " def __init__(self, headless=True):\n", + " logger.debug(\"Starting Selenium Browser : headless = %s\", headless)\n", + " self.options = Options()\n", + " if headless:\n", + " self.options.headless = True\n", + "\n", + " webdriver.Firefox.__init__(self, options=self.options)\n", + "\n", + " def restart(self):\n", + " self.quit()\n", + " webdriver.Firefox.__init__(self, options=self.options)\n", + "\n", + " def get_url(self, url):\n", + " logger.debug(\"get_url(browser, %s)\", url)\n", + " while True:\n", + " self.get(str(url))\n", + " class_to_click_on = [\n", + " \"as-oil__btn-optin\", # cookie bar\n", + " \"fc-cta-consent\", # consent popup\n", + " # \"ad-close-button\", # advertisement banner\n", + " ]\n", + " for i in class_to_click_on:\n", + " if len(self.find_elements(By.CLASS_NAME, i)) > 0:\n", + " self.find_element(By.CLASS_NAME, i).click()\n", + " logger.debug(f\"{i} found. Clicking on it.\")\n", + "\n", + " if len(self.find_elements(By.CLASS_NAME, \"disco_expand_section_link\")) > 0:\n", + " try:\n", + " for index, link in enumerate(\n", + " self.find_elements(By.CLASS_NAME, \"disco_expand_section_link\")\n", + " ):\n", + " self.execute_script(\n", + " f\"document.getElementsByClassName('disco_expand_section_link')[{index}].scrollIntoView(true);\"\n", + " )\n", + " link.click()\n", + " time.sleep(0.2)\n", + " except Exception as e:\n", + " logger.debug('No \"Show all\" links found : %s.', e)\n", + " # Test if IP is banned.\n", + " if self.is_ip_banned():\n", + " logger.error(\n", + " \"IP banned from rym. Can't do any requests to the website. Exiting.\"\n", + " )\n", + " self.quit()\n", + " exit()\n", + " # Test if browser is rate-limited.\n", + " if self.is_rate_limited():\n", + " logger.error(\"Rate-limit detected. Restarting browser.\")\n", + " self.restart()\n", + " else:\n", + " break\n", + " return\n", + "\n", + " def get_soup(self):\n", + " return BeautifulSoup(self.page_source, \"lxml\")\n", + "\n", + " def is_ip_banned(self):\n", + " logger.debug(\"soup.title : %s\", self.get_soup().title)\n", + " return self.get_soup().title.text.strip() == \"IP blocked\"\n", + "\n", + " def is_rate_limited(self):\n", + " return self.get_soup().find(\"form\", {\"id\": \"sec_verify\"})\n" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "def get_chart_row_infos(row: element.Tag) -> dict:\n", + " \"\"\"Returns a dict containing infos from a chart row.\"\"\"\n", + " dict_row = {}\n", + " try:\n", + " dict_row[\"Rank\"] = row.get(\"id\").replace(\"pos\", \"\")\n", + " except Exception as e:\n", + " logger.error(\"Error when fetching Rank: %s\", e)\n", + " dict_row[\"Rank\"] = \"NA\"\n", + " try:\n", + " artist_div = dict_row[\"Artist\"] = row.find(\n", + " \"div\",\n", + " {\"class\": \"page_charts_section_charts_item_credited_links_primary\"},\n", + " )\n", + " romanized_version_span = artist_div.find(\n", + " \"span\", {\"class\": \"ui_name_locale_language\"}\n", + " )\n", + "\n", + " original_name_span = artist_div.find(\n", + " \"span\", {\"class\": \"ui_name_locale_original\"}\n", + " )\n", + "\n", + " if romanized_version_span:\n", + " dict_row[\n", + " \"Artist\"\n", + " ] = f\"{romanized_version_span.text} [{original_name_span.text}]\"\n", + " elif original_name_span:\n", + " dict_row[\"Artist\"] = original_name_span.text\n", + " else:\n", + " dict_row[\"Artist\"] = artist_div.text\n", + "\n", + " dict_row[\"Artist\"] = dict_row[\"Artist\"].replace(\"\\n\", \"\")\n", + " except Exception as e:\n", + " logger.error(\"Error when fetching Artist: %s\", e)\n", + " dict_row[\"Artist\"] = \"NA\"\n", + " try:\n", + " dict_row[\"Album\"] = row.find(\n", + " \"div\",\n", + " {\"class\": \"page_charts_section_charts_item_title\"},\n", + " ).text.replace(\"\\n\", \"\")\n", + " logger.debug(\n", + " \"%s - %s - %s\",\n", + " dict_row[\"Rank\"],\n", + " dict_row[\"Artist\"],\n", + " dict_row[\"Album\"],\n", + " )\n", + " except Exception as e:\n", + " logger.error(\"Error when fetching Album: %s\", e)\n", + " dict_row[\"Album\"] = \"NA\"\n", + " try:\n", + " dict_row[\"Date\"] = (\n", + " row.find(\"div\", {\"class\": \"page_charts_section_charts_item_date\"})\n", + " .find_all(\"span\")[0]\n", + " .text.replace(\"\\n\", \"\")\n", + " .strip()\n", + " )\n", + " except Exception as e:\n", + " logger.error(\"Error when fetching Date: %s\", e)\n", + " dict_row[\"Date\"] = \"NA\"\n", + " try:\n", + " dict_row[\"Genres\"] = \", \".join(\n", + " [\n", + " x.text\n", + " for x in row.find(\n", + " \"div\", {\"class\": \"page_charts_section_charts_item_genres_primary\"}\n", + " ).find_all(\"a\", {\"class\": \"genre\"})\n", + " ]\n", + " )\n", + " except Exception as e:\n", + " logger.error(\"Error when fetching Genres: %s\", e)\n", + " dict_row[\"Genres\"] = \"NA\"\n", + " try:\n", + " dict_row[\"RYM Rating\"] = row.find(\n", + " \"span\", {\"class\": \"page_charts_section_charts_item_details_average_num\"}\n", + " ).text\n", + " except Exception as e:\n", + " logger.error(\"Error when fetching RYM Rating: %s\", e)\n", + " dict_row[\"RYM Rating\"] = \"NA\"\n", + " try:\n", + " dict_row[\"Ratings\"] = (\n", + " row.find_all(\"span\", {\"class\": \"full\"})[0]\n", + " .text.replace(\"\\n\", \"\")\n", + " .replace(\" \", \"\")\n", + " )\n", + " except Exception as e:\n", + " logger.error(\"Error when fetching Ratings: %s\", e)\n", + " dict_row[\"Ratings\"] = \"NA\"\n", + " try:\n", + " dict_row[\"Reviews\"] = (\n", + " row.find_all(\"span\", {\"class\": \"full\"})[1]\n", + " .text.replace(\"\\n\", \"\")\n", + " .replace(\" \", \"\")\n", + " )\n", + " except Exception as e:\n", + " logger.error(\"Error when fetching Reviews: %s\", e)\n", + " dict_row[\"Reviews\"] = \"NA\"\n", + " return dict_row\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [], + "source": [ + "def get_chart_infos(max_page: int = None) -> List[Dict]:\n", + " \"\"\"Returns a list of dicts containing chart infos.\n", + "\n", + " Parameters:\n", + " url: An url for a chart. Can be created with the RymUrl helper.\n", + " See the get_chart.py script in the examples folder for an example.\n", + " max_page: The max number of pages to extract from the chart.\n", + "\n", + " Returns:\n", + " list_rows: List of dicts for each rows from the chart.\n", + "\n", + " \"\"\"\n", + " browser = RymBrowser(headless=True)\n", + " #for page in range(1,max_page+1):\n", + " #url=f\"https://rateyourmusic.com/charts/top/album/all-time/g:city-pop/{page}/\"\n", + " page=1\n", + " #logger.info(\"Extracting chart informations for %s.\", url)\n", + "\n", + " list_rows = []\n", + " while True:\n", + " try:\n", + " url=f\"https://rateyourmusic.com/charts/top/album/all-time/g:city-pop/{page}/\"\n", + " browser.get_url(url)\n", + " logger.debug(\"Extracting chart rows for url %s\", url)\n", + " soup = browser.get_soup()\n", + "\n", + " # table containing albums\n", + " if soup.find(\"sections\", {\"id\": \"page_sections_charts\"}):\n", + " logger.debug(\"Table containing chart elements found\")\n", + " table = soup.find(\"section\", {\"id\": \"page_charts_section_charts\"})\n", + " rows = table.find_all(\n", + " \"div\", {\"class\": \"page_section_charts_item_wrapper\"}\n", + " )\n", + " if len(rows) == 0:\n", + " logger.debug(\"No rows extracted. Exiting\")\n", + " break\n", + " for row in rows:\n", + " # don't parse ads\n", + " if not row.find(\"script\"):\n", + " dict_row = get_chart_row_infos(row)\n", + " list_rows.append(dict_row)\n", + " else:\n", + " logger.warning(\"Table class mbgen not found\")\n", + " break\n", + "\n", + " # link to the next page\n", + " if soup.find(\"a\", {\"class\": \"ui_pagination_next\"}):\n", + " logger.debug(\"Next page found\")\n", + " if max_page and page == max_page:\n", + " break\n", + " page += 1\n", + " soup.decompose()\n", + " try:\n", + " url=f\"https://rateyourmusic.com/charts/top/album/all-time/g:city-pop/{page}/\"\n", + " browser.get_url(url)\n", + " soup = browser.get_soup()\n", + " except Exception as e:\n", + " logger.error(e)\n", + " break\n", + " else:\n", + " logger.debug(\"No next page found. Exiting.\")\n", + " break\n", + " except Exception as e:\n", + " logger.error(\"Error scraping page %s : %s\", url, e)\n", + " break\n", + "\n", + " return list_rows" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\navin\\AppData\\Local\\Temp\\ipykernel_9324\\2028168715.py:9: DeprecationWarning: headless property is deprecated, instead use add_argument('-headless')\n", + " self.options.headless = True\n" + ] + } + ], + "source": [ + "#rym_url = RymUrl.RymUrl() # default: top of all-time. See examples/get_chart.py source code for more options.\n", + "chart_infos = get_chart_infos(max_page=30)\n", + "df = pd.DataFrame(chart_infos)\n", + "df=df[['Rank', 'Artist', 'Album', 'RYM Rating', 'Ratings']]" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [], + "source": [ + "df.to_csv('allcitypop4.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [], + "source": [ + "#sorted_df = df.sort_values(by=['Ratings'], ascending=False) //sort by popularity\n", + "#sorted_df = df.sort_values(by=['RYM Rating'], ascending=False) //sort by rating\n", + "#df['RYM Rating'].quantile(q=0.9)\n", + "df['RYM Rating'] = df['RYM Rating'].astype(float)" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [], + "source": [ + "df['Ratings'] = df['Ratings'].str.replace(',', '').astype(int)" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "3.63" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['RYM Rating'].quantile(q=0.9)#q=0.9 means top 10% albums raed" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "17.0" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['Ratings'].quantile(q=0.5)" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [], + "source": [ + "sorted_df = df.sort_values(by=['RYM Rating'], ascending=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [], + "source": [ + "sorted_df.to_csv('allcitypop5.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Rank Artist Album RYM Rating Ratings\n", + "0 1 Tatsuro Yamashita [山下達郎] Ride on Time 3.89 2444\n", + "1 2 Tatsuro Yamashita [山下達郎] For You 3.83 4231\n", + "2 3 Anri [杏里] Timely!! 3.79 3385\n", + "3 4 Masayoshi Takanaka [高中正義] Seychelles 3.76 1222\n", + "4 5 Masayoshi Takanaka [高中正義] The Rainbow Goblins 3.73 1170\n" + ] + } + ], + "source": [ + "print(df.head())" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [], + "source": [ + "df.to_csv('CITYPOP.csv',encoding='utf-8-sig')" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [], + "source": [ + "sorted_df.to_csv('CITYPOPSORTED.csv',encoding='utf-8-sig')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}