diff --git a/GSC_Export_for_Content_Graph.ipynb b/GSC_Export_for_Content_Graph.ipynb new file mode 100644 index 0000000..4394b0b --- /dev/null +++ b/GSC_Export_for_Content_Graph.ipynb @@ -0,0 +1,543 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "GSC Export for Content Graph.ipynb", + "provenance": [], + "collapsed_sections": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.5" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "68999de7732d48d2891c6246f6801460": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DropdownModel", + "state": { + "_options_labels": [ + "https://lolsurprise.mgae.com/", + "https://babyborn.mgae.com/", + "sc-domain:contentaudience.com", + "sc-domain:liston.io", + "sc-domain:servedontsell.co", + "https://extricareusa.com/", + "https://lalaloopsy.mgae.com/", + "https://www.mgae.com/", + "https://whatsinmypursesurprise.mgae.com/", + "https://anchoradvisors.com/", + "sc-domain:philipmorganconsulting.com", + "http://polycor.com/", + "https://indiyoung.com/", + "https://smithhonig.com/", + "https://poopsie.mgae.com/", + "sc-domain:polycor.com", + "https://www.lolsurprise.com/", + "http://www.nfcamenitymanagement.com/", + "sc-domain:anchoradvisors.com", + "sc-domain:designasylumblog.com", + "sc-domain:plumflowersoftware.com", + "https://www.2percentcommission.com/", + "https://inboundfound.com/", + "https://sbstandard.com/", + "https://dojobattle.mgae.com/", + "sc-domain:littletikes.com", + "https://shreddinsharks.mgae.com/", + "https://numnoms.mgae.com/", + "http://themoriuchigroup.com/", + "https://www.servedontsell.co/", + 
"https://hardscapes.polycor.com/", + "sc-domain:servedontsell.com", + "https://virorides.mgae.com/", + "http://philipmorganconsulting.com/", + "https://contentaudience.com/", + "https://www.crresearch.com/", + "https://bratz.mgae.com/", + "https://nation.everspaces.com/", + "https://servedontsell.com/", + "https://glamgoo.mgae.com/", + "https://philipmorganconsulting.com/", + "http://www.anchoradvisors.com/", + "https://bodyunburdened.com/", + "https://polycor.com/", + "https://nananasurprise.mgae.com/", + "https://thehangrees.mgae.com/", + "sc-domain:smithhonig.com", + "https://straightupcraft.com/", + "https://blog.polycor.com/", + "sc-domain:mgae.com", + "https://www.everspaces.com/", + "http://www.jerseyicecreamco.com/", + "https://www.polycor.com/", + "http://hardscapes.polycor.com/", + "sc-domain:metahelm.com", + "http://inthedragonslair.blogspot.com/", + "http://www.contentaudience.com/", + "https://gamerox.mgae.com/", + "sc-domain:2percentcommission.com", + "http://www.2percentcommission.com/", + "https://rainbowsinpieces.mgae.com/", + "https://projectmc2.mgae.com/", + "https://lovefraud.com/", + "https://www.indianalimestonecompany.com/", + "sc-domain:sbstandard.com", + "https://kingdombuilders.mgae.com/", + "sc-domain:inthedragonslair.blogspot.com", + "https://fitnesstestdrive.com/", + "http://www.littletikes.com/", + "https://www.littletikes.com/", + "http://www.polycor.com/", + "https://www.liston.io/", + "sc-domain:incourage.com", + "https://www.jerseyicecreamco.com/", + "https://wreckroyale.mgae.com/" + ], + "_view_name": "DropdownView", + "style": "IPY_MODEL_05ec27ff33674c24a3ca08654b172f91", + "_dom_classes": [], + "description": "Site: ", + "_model_name": "DropdownModel", + "index": 52, + "_view_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_view_count": null, + "disabled": false, + "_view_module_version": "1.5.0", + "description_tooltip": null, + "_model_module": "@jupyter-widgets/controls", + "layout": 
"IPY_MODEL_b5b7d1857db04d32b3392017cbb3f038" + } + }, + "05ec27ff33674c24a3ca08654b172f91": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "state": { + "_view_name": "StyleView", + "_model_name": "DescriptionStyleModel", + "description_width": "", + "_view_module": "@jupyter-widgets/base", + "_model_module_version": "1.5.0", + "_view_count": null, + "_view_module_version": "1.2.0", + "_model_module": "@jupyter-widgets/controls" + } + }, + "b5b7d1857db04d32b3392017cbb3f038": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "state": { + "_view_name": "LayoutView", + "grid_template_rows": null, + "right": null, + "justify_content": null, + "_view_module": "@jupyter-widgets/base", + "overflow": null, + "_model_module_version": "1.2.0", + "_view_count": null, + "flex_flow": null, + "width": null, + "min_width": null, + "border": null, + "align_items": null, + "bottom": null, + "_model_module": "@jupyter-widgets/base", + "top": null, + "grid_column": null, + "overflow_y": null, + "overflow_x": null, + "grid_auto_flow": null, + "grid_area": null, + "grid_template_columns": null, + "flex": null, + "_model_name": "LayoutModel", + "justify_items": null, + "grid_row": null, + "max_height": null, + "align_content": null, + "visibility": null, + "align_self": null, + "height": null, + "min_height": null, + "padding": null, + "grid_auto_rows": null, + "grid_gap": null, + "max_width": null, + "order": null, + "_view_module_version": "1.2.0", + "grid_template_areas": null, + "object_position": null, + "object_fit": null, + "grid_auto_columns": null, + "margin": null, + "display": null, + "left": null + } + } + } + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "BeQquQKYb5Ds" + }, + "source": [ + "# GSC to CSV\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "azyvUd1RV5Kj" + }, + "source": [ + "\n", + "#### Search Console Data\n", + "\n", + "##### Getting 
a Google API Credential.\n", + "\n", + "1) Sign up for a new project in the Google APIs console at https://code.google.com/apis/console.\n", + "\n", + "2) Choose \"Credentials\" on the left-hand menu.\n", + "\n", + "3) Choose \"Create Credentials\".\n", + "\n", + "4) Generate an OAuth client ID for your application.\n", + "\n", + "5) Copy your client ID, client secret, and redirect URL into the \"Configure your access\" cell below.\n", + "\n", + "Get the data by week; if that is not possible, step through the data day by day instead.\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "o_hn6RDfXxIi" + }, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Xl_Q0CJlXxrI" + }, + "source": [ + "" + ] + }, + { + "cell_type": "code", + "metadata": { + "cellView": "code", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "FeR0mUspmpwy", + "outputId": "68b7bf6a-9464-4cca-8bbc-e298785f27a1" + }, + "source": [ + "#@title Configure your access\n", + "import httplib2\n", + "\n", + "from apiclient import errors\n", + "from apiclient.discovery import build\n", + "from oauth2client.client import OAuth2WebServerFlow\n", + "\n", + "# Copy your credentials from the console\n", + "CLIENT_ID = '87393635730-0o6ctsjmronvvn0qfh3j9jeomiccpqbj.apps.googleusercontent.com' #@param {type:\"string\"}\n", + "CLIENT_SECRET = 'XQu8YEO3C5b1jV4wo18UDFsH' #@param {type:\"string\"}\n", + "\n", + "# Check https://developers.google.com/webmaster-tools/search-console-api-original/v3/ for all available scopes\n", + "OAUTH_SCOPE = 'https://www.googleapis.com/auth/webmasters.readonly'\n", + "\n", + "# Redirect URI for installed apps\n", + "REDIRECT_URI = 'urn:ietf:wg:oauth:2.0:oob'\n", + "\n", + "# Run through the OAuth flow and retrieve credentials\n", + "flow = OAuth2WebServerFlow(CLIENT_ID, CLIENT_SECRET, OAUTH_SCOPE, REDIRECT_URI)\n", + "authorize_url = flow.step1_get_authorize_url()\n", + "print('Go to the following link in your browser: ' + authorize_url)\n", + "code = input('Enter 
verification code: ').strip()\n", + "credentials = flow.step2_exchange(code)" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "__init__() takes at most 4 positional arguments (5 given)\n" + ], + "name": "stderr" + }, + { + "output_type": "stream", + "text": [ + "Go to the following link in your browser: https://accounts.google.com/o/oauth2/v2/auth?client_id=87393635730-0o6ctsjmronvvn0qfh3j9jeomiccpqbj.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fwebmasters.readonly&access_type=offline&response_type=code\n", + "Enter verification code: 4/1AY0e-g50X8G7CvBhF-95aYSm8G2ikixK_3pgNSQw0xmyrtvEFEG2JLgSuWE\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "cellView": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000, + "referenced_widgets": [ + "68999de7732d48d2891c6246f6801460", + "05ec27ff33674c24a3ca08654b172f91", + "b5b7d1857db04d32b3392017cbb3f038" + ] + }, + "id": "ifRkQVGhv7RK", + "outputId": "dc3e4a25-fdfb-4392-a27d-e532cddf73f9" + }, + "source": [ + "#@title Choose your projects\n", + "#@markdown Generate your selector\n", + "\n", + "# Create an httplib2.Http object and authorize it with our credentials\n", + "http = httplib2.Http()\n", + "http = credentials.authorize(http)\n", + "\n", + "webmasters_service = build('webmasters', 'v3', http=http)\n", + "\n", + "# Retrieve list of properties in account\n", + "site_list = webmasters_service.sites().list().execute()\n", + "\n", + "# Filter for verified websites\n", + "verified_sites_urls = [s['siteUrl'] for s in site_list['siteEntry']\n", + " if s['permissionLevel'] != 'siteUnverifiedUser']\n", + " # and s['siteUrl'][:4] == 'http']\n", + "\n", + "# Print the URLs of all websites you are verified for.\n", + "projects = []\n", + "for site_url in verified_sites_urls:\n", + " print(site_url)\n", + " projects.append(('{}'.format(site_url), 
site_url))\n", + "\n", + "# print(domain_url)\n", + "# Create Selector\n", + "import ipywidgets as widgets\n", + "output = widgets.Output()\n", + "\n", + "dropdown_purpose = widgets.Dropdown(options = projects, description=\"Site: \")\n", + "\n", + "def dropdown_project_eventhandler(change):\n", + " output.clear_output() \n", + " with output:\n", + " display(projects)\n", + "\n", + "dropdown_purpose.observe(dropdown_project_eventhandler, names='value')\n", + "\n", + "display(dropdown_purpose)" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "https://lolsurprise.mgae.com/\n", + "https://babyborn.mgae.com/\n", + "sc-domain:contentaudience.com\n", + "sc-domain:liston.io\n", + "sc-domain:servedontsell.co\n", + "https://extricareusa.com/\n", + "https://lalaloopsy.mgae.com/\n", + "https://www.mgae.com/\n", + "https://whatsinmypursesurprise.mgae.com/\n", + "https://anchoradvisors.com/\n", + "sc-domain:philipmorganconsulting.com\n", + "http://polycor.com/\n", + "https://indiyoung.com/\n", + "https://smithhonig.com/\n", + "https://poopsie.mgae.com/\n", + "sc-domain:polycor.com\n", + "https://www.lolsurprise.com/\n", + "http://www.nfcamenitymanagement.com/\n", + "sc-domain:anchoradvisors.com\n", + "sc-domain:designasylumblog.com\n", + "sc-domain:plumflowersoftware.com\n", + "https://www.2percentcommission.com/\n", + "https://inboundfound.com/\n", + "https://sbstandard.com/\n", + "https://dojobattle.mgae.com/\n", + "sc-domain:littletikes.com\n", + "https://shreddinsharks.mgae.com/\n", + "https://numnoms.mgae.com/\n", + "http://themoriuchigroup.com/\n", + "https://www.servedontsell.co/\n", + "https://hardscapes.polycor.com/\n", + "sc-domain:servedontsell.com\n", + "https://virorides.mgae.com/\n", + "http://philipmorganconsulting.com/\n", + "https://contentaudience.com/\n", + "https://www.crresearch.com/\n", + "https://bratz.mgae.com/\n", + "https://nation.everspaces.com/\n", + "https://servedontsell.com/\n", + 
"https://glamgoo.mgae.com/\n", + "https://philipmorganconsulting.com/\n", + "http://www.anchoradvisors.com/\n", + "https://bodyunburdened.com/\n", + "https://polycor.com/\n", + "https://nananasurprise.mgae.com/\n", + "https://thehangrees.mgae.com/\n", + "sc-domain:smithhonig.com\n", + "https://straightupcraft.com/\n", + "https://blog.polycor.com/\n", + "sc-domain:mgae.com\n", + "https://www.everspaces.com/\n", + "http://www.jerseyicecreamco.com/\n", + "https://www.polycor.com/\n", + "http://hardscapes.polycor.com/\n", + "sc-domain:metahelm.com\n", + "http://inthedragonslair.blogspot.com/\n", + "http://www.contentaudience.com/\n", + "https://gamerox.mgae.com/\n", + "sc-domain:2percentcommission.com\n", + "http://www.2percentcommission.com/\n", + "https://rainbowsinpieces.mgae.com/\n", + "https://projectmc2.mgae.com/\n", + "https://lovefraud.com/\n", + "https://www.indianalimestonecompany.com/\n", + "sc-domain:sbstandard.com\n", + "https://kingdombuilders.mgae.com/\n", + "sc-domain:inthedragonslair.blogspot.com\n", + "https://fitnesstestdrive.com/\n", + "http://www.littletikes.com/\n", + "https://www.littletikes.com/\n", + "http://www.polycor.com/\n", + "https://www.liston.io/\n", + "sc-domain:incourage.com\n", + "https://www.jerseyicecreamco.com/\n", + "https://wreckroyale.mgae.com/\n" + ], + "name": "stdout" + }, + { + "output_type": "display_data", + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "68999de7732d48d2891c6246f6801460", + "version_minor": 0, + "version_major": 2 + }, + "text/plain": [ + "Dropdown(description='Site: ', options=(('https://lolsurprise.mgae.com/', 'https://lolsurprise.mgae.com/'), ('…" + ] + }, + "metadata": { + "tags": [] + } + } + ] + }, + { + "cell_type": "code", + "metadata": { + "cellView": "code", + "id": "hwBqLKrfwTNw" + }, + "source": [ + "#@title Get your data\n", + "#@markdown Note that if you specify a 30-day period, the script will split the data in two and compare the last 15 days with the first 
15.\n", + "import pandas as pd\n", + "from dateutil import rrule\n", + "from datetime import datetime, timedelta, date\n", + "\n", + "url = \"\"\n", + "if(len(dropdown_purpose.value)==0):\n", + " print(\"no Site detected\")\n", + "else:\n", + " url = dropdown_purpose.value\n", + "\n", + "# init\n", + "DF = pd.DataFrame(columns=['page', 'query', 'clicks', 'impressions', 'ctr', 'position', 'date' ])\n", + "## start of get data\n", + "def get_data(startDate, endDate, base= rrule.DAILY):\n", + " '''\n", + " startDate: date string with the following format %Y-%m-%d\n", + " '''\n", + " DF = pd.DataFrame(columns=['page', 'query', 'clicks', 'impressions', 'ctr', 'position', 'date' ])\n", + " request = {\n", + " 'startDate': startDate, \n", + " 'endDate': '2100-12-29', \n", + " 'dimensions': [\"page\", \"query\"],\n", + " 'searchType': \"web\", \n", + " 'responseAggregationType': 'page',\n", + " 'rowLimit': 25000\n", + " }\n", + "\n", + " for dt in rrule.rrule(base, dtstart=datetime.strptime(startDate, '%Y-%m-%d'), until=datetime.strptime(endDate, '%Y-%m-%d'))[1:]:\n", + " request['endDate'] = dt.strftime('%Y-%m-%d')\n", + " response = webmasters_service.searchanalytics().query(siteUrl=url, body=request).execute()\n", + " try:\n", + " dfGSC = pd.DataFrame.from_dict(response['rows'], orient='columns')\n", + " # if rows more than 25000 then go daily\n", + " if dfGSC.shape[0] == 25000:\n", + " #daily for one week\n", + " week_start = request['startDate']\n", + " for dt in rrule.rrule(rrule.DAILY, dtstart=datetime.strptime(request['startDate'], '%Y-%m-%d'), until=datetime.strptime(request['endDate'], '%Y-%m-%d'))[1:]:\n", + " \n", + " request['endDate'] = dt.strftime('%Y-%m-%d')\n", + " response = webmasters_service.searchanalytics().query(siteUrl=url, body=request).execute()\n", + " try:\n", + " dfGSC = pd.DataFrame.from_dict(response['rows'], orient='columns')\n", + " dfGSC[[\"page\", \"query\"]] = pd.DataFrame( dfGSC[\"keys\"].values.tolist() )\n", + " dfGSC = 
dfGSC.drop(columns=['keys'])\n", + " dfGSC['date'] = week_start\n", + " dfGSC = dfGSC[['page', 'query', 'clicks', 'impressions', 'ctr', 'position', 'date' ]]\n", + " #print(dfGSC.keys()) \n", + " DF = pd.concat([DF, dfGSC], axis=0)\n", + " request['startDate'] = request['endDate']\n", + " except Exception as e:\n", + " print('No Data!')\n", + " break\n", + " continue\n", + "\n", + " dfGSC[[\"page\", \"query\"]] = pd.DataFrame( dfGSC[\"keys\"].values.tolist() )\n", + " dfGSC = dfGSC.drop(columns=['keys'])\n", + " dfGSC['date'] = request['startDate']\n", + " dfGSC = dfGSC[['page', 'query', 'clicks', 'impressions', 'ctr', 'position', 'date' ]]\n", + " #print(dfGSC.keys()) \n", + " DF = pd.concat([DF, dfGSC], axis=0)\n", + " \n", + " except Exception as e:\n", + " print('No Data!')\n", + " break\n", + " request['startDate'] = request['endDate']\n", + " return DF\n", + "\n", + "total_data = get_data('2021-01-01', '2021-06-02')\n", + "total_data.to_csv('gsc-api-daily-www-polycor.csv', index=False)" + ], + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file diff --git a/Tutorials/PageRankandCheiRank.ipynb b/Tutorials/PageRankandCheiRank.ipynb index 9ae8e8e..6161f34 100644 --- a/Tutorials/PageRankandCheiRank.ipynb +++ b/Tutorials/PageRankandCheiRank.ipynb @@ -12,7 +12,7 @@ "\n", "* **all_inlinks.csv**: Is an bulk export of `Bulk Export` > `All Inlinks` from Screaming Frog.\n", "\n", - "Both files are raw exports so Column names are the defaults and the read_csv function expects to skip the first row.\n", + "Both files are raw exports, so column names are the defaults. For Screaming Frog 12.4 and earlier, `read_csv` must skip the first row (i.e. `skiprows=1`); for Screaming Frog 12.5 and later, it must not skip the first row (i.e. 
`skiprows=0`).\n", "\n", "A follow up to this [tweet](https://twitter.com/willem_nout/status/1101417508685467648).\n", "\n", @@ -173,7 +173,7 @@ "metadata": {}, "outputs": [], "source": [ - "df_html = pd.read_csv('internal_html.csv', skiprows=1)\n", + "df_html = pd.read_csv('internal_html.csv', skiprows=0)\n", "\n", "# Grab 200 urls and canonicalize\n", "df_html, mappings = consolidate_urls(df_html)\n", @@ -200,7 +200,7 @@ "metadata": {}, "outputs": [], "source": [ - "df_links = pd.read_csv('all_inlinks.csv', skiprows=1, low_memory=False)\n", + "df_links = pd.read_csv('all_inlinks.csv', skiprows=0, low_memory=False)\n", "\n", "# keep only Ahref and Follow\n", "df_links = df_links[(df_links['Type'] == \"AHREF\") & (df_links['Follow'] == True)]\n",