From 49068841404a374597d362175871eac63a19be5e Mon Sep 17 00:00:00 2001
From: Gaurav Sheni <gvsheni@gmail.com>
Date: Thu, 24 Jul 2025 17:34:41 -0400
Subject: [PATCH 1/8] updates

---
 README.md              | 75 ++++++++++++++++++++++++++++++++++++++++++
 config.yaml            |  1 -
 pymetrics/__main__.py  |  7 ++--
 pymetrics/metrics.py   |  3 +-
 pymetrics/pypi.py      |  8 +++--
 pymetrics/summarize.py |  4 +++
 6 files changed, 90 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index 0a55d0a..db91652 100644
--- a/README.md
+++ b/README.md
@@ -48,6 +48,31 @@ Currently, the download data is collected from the following distributions:
 In the future, we may expand the source distributions to include:
 * [GitHub Releases](https://github.com/): Information about the project downloads from GitHub releases.
 
+# Install
+Install pymetrics using pip (or uv):
+```shell
+pip install git+ssh://git@github.com/sdv-dev/pymetrics
+```
+
+## Local Usage
+Collect metrics from PyPI by running `gitmetrics` on your computer. You need to provide the following:
+
+1. BigQuery Credentials. In order to get PyPI download data, you need to execute queries on Google BigQuery.
+  Therefore, you will need an authentication JSON file, which must be provided to you by a privileged admin.
+  Once you have this JSON file, export the contents of the credentials file into a
+  `BIGQUERY_CREDENTIALS` environment variable.
+2. A set of Google Drive Credentials need to be provided in the format required by `PyDrive`. The
+   credentials must be passed via the `PYDRIVE_CREDENTIALS` environment variable.
+   - See [instructions from PyDrive](https://pythonhosted.org/PyDrive/quickstart.html).
+3. A list of PyPI projects for which to collect the download metrics, defined in a YAML file.
+   See [config.yaml](./config.yaml) for an example.
+
+You can run pymetrics with the following CLI command:
+
+```shell
+pymetrics collect-pypi --max-days 30 --add-metrics --output-folder {OUTPUT_FOLDER}
+```
+
 ## Workflows
 
 ### Daily Collection
@@ -77,5 +102,55 @@ Installing the main SDV library also installs all the other libraries as depende
 
 This methodology prevents double-counting downloads while providing an accurate representation of SDV usage.
 
+## PyPI Data
+PyMetrics collects download information from PyPI by querying the [public PyPI download statistics dataset on BigQuery](https://console.cloud.google.com/bigquery?p=bigquery-public-data&d=pypi&page=dataset). The following data fields are captured for each download event:
+
+**Temporal & Geographic Data:**
+* `timestamp`: The timestamp at which the download happened
+* `country_code`: The 2-letter country code
+
+**Package Information:**
+* `project`: The name of the PyPI project (library) that is being downloaded
+* `version`: The downloaded version
+* `type`: The type of file that was downloaded (source or wheel)
+
+**Installation Environment:**
+* `installer_name`: The installer used for the download, like `pip` or `bandersnatch` or `uv`
+* `implementation_name`: The name of the Python implementation, such as `cpython`
+* `implementation_version`: The Python version
+* `ci`: A boolean flag indicating whether the download originated from a CI system (True, False, or null). This is determined by checking for specific environment variables set by CI platforms such as Azure Pipelines (`BUILD_BUILDID`), Jenkins (`BUILD_ID`), or general CI indicators (`CI`, `PIP_IS_CI`)
+
+**System Information:**
+* `distro_name`: Name of the Linux or Mac distribution (empty if Windows)
+* `distro_version`: Distribution version (empty for Windows)
+* `system_name`: Type of OS, like Linux, Darwin (for Mac), or Windows
+* `system_release`: OS version in case of Windows, kernel version in case of Unix
+* `cpu`: CPU architecture used
+
+## Aggregation Metrics
+
+If the `--add-metrics` option is passed to `pymetrics`, a spreadsheet with aggregation
+metrics will be created alongside the raw PyPI downloads CSV file for each individual project.
+
+The aggregation metrics spreadsheets contain the following tabs:
+
+* **By Month:** Number of downloads per month and increase in the number of downloads from month to month.
+* **By Version:** Absolute and relative number of downloads per version.
+* **By Country Code:** Absolute and relative number of downloads per Country.
+* **By Python Version:** Absolute and relative number of downloads per minor Python Version (X.Y, like 3.8).
+* **By Full Python Version:** Absolute and relative number of downloads per Python Version, including
+  the patch number (X.Y.Z, like 3.8.1).
+* **By Installer Name:** Absolute and relative number of downloads per Installer (e.g. pip)
+* **By Distro Name:** Absolute and relative number of downloads per Distribution Name (e.g. Ubuntu)
+* **By Distro Version:** Absolute and relative number of downloads per Distribution Name AND Version (e.g. Ubuntu 20.04)
+* **By Distro Kernel:** Absolute and relative number of downloads per Distribution Name, Version AND Kernel (e.g. Ubuntu 18.04 - 5.4.104+)
+* **By OS Type:** Absolute and relative number of downloads per OS Type (e.g. Linux)
+* **By Cpu:** Absolute and relative number of downloads per CPU Version (e.g. AMD64)
+* **By CI:** Absolute and relative number of downloads by CI status (automated vs. manual installations)
+* **By Month and Version:** Absolute number of downloads per month and version.
+* **By Month and Python Version:** Absolute number of downloads per month and Python version.
+* **By Month and Country Code:** Absolute number of downloads per month and country.
+* **By Month and Installer Name:** Absolute number of downloads per month and Installer.
+
 ## Known Issues
 1. The conda package download data for Anaconda does not match the download count shown on the website. This is due to missing download data in the conda package download data. See this: https://github.com/anaconda/anaconda-package-data/issues/45
diff --git a/config.yaml b/config.yaml
index 0143d52..791c76f 100644
--- a/config.yaml
+++ b/config.yaml
@@ -1,4 +1,3 @@
-max-days: 7
 projects:
   - sdv
   - ctgan
diff --git a/pymetrics/__main__.py b/pymetrics/__main__.py
index 6679102..3b28406 100644
--- a/pymetrics/__main__.py
+++ b/pymetrics/__main__.py
@@ -49,7 +49,7 @@ def _collect_pypi(args):
     config = _load_config(args.config_file)
     projects = args.projects or config['projects']
     output_folder = args.output_folder
-    max_days = args.max_days or config.get('max-days')
+    max_days = args.max_days
 
     collect_pypi_downloads(
         projects=projects,
@@ -175,7 +175,8 @@ def _get_parser():
         '--max-days',
         type=int,
         required=False,
-        help='Max days of data to pull if start-date is not given.',
+        default=30,
+        help='Max days of data to pull if start-date is not given. Default to last 30 days.',
     )
     collect_pypi.add_argument(
         '-f',
@@ -241,7 +242,7 @@ def _get_parser():
         type=int,
         required=False,
         default=90,
-        help='Max days of data to pull.',
+        help='Max days of data to pull. Default to last 90 days.',
     )
     return parser
 
diff --git a/pymetrics/metrics.py b/pymetrics/metrics.py
index f98a595..d6bfbcd 100644
--- a/pymetrics/metrics.py
+++ b/pymetrics/metrics.py
@@ -11,7 +11,7 @@
 
 
 def _groupby(downloads, groupby, index_name=None, percent=True):
-    grouped = downloads.groupby(groupby).size().reset_index()
+    grouped = downloads.groupby(groupby, dropna=False).size().reset_index()
     grouped.columns = [index_name or groupby, 'downloads']
     if percent:
         grouped['percent'] = (grouped.downloads * 100 / grouped.downloads.sum()).round(3)
@@ -78,6 +78,7 @@ def _get_sheet_name(column):
     'distro_kernel',
     'OS_type',
     'cpu',
+    'ci',
 ]
 SORT_BY_DOWNLOADS = [
     'country_code',
diff --git a/pymetrics/pypi.py b/pymetrics/pypi.py
index fbe9439..84d74d7 100644
--- a/pymetrics/pypi.py
+++ b/pymetrics/pypi.py
@@ -25,6 +25,7 @@
     details.system.name             as system_name,
     details.system.release          as system_release,
     details.cpu                     as cpu,
+    details.ci                      as ci,
 FROM `bigquery-public-data.pypi.file_downloads`
 WHERE file.project in {projects}
     AND timestamp > '{start_date}'
@@ -44,6 +45,7 @@
     'system_name',
     'system_release',
     'cpu',
+    'ci',
 ]
 
 
@@ -129,9 +131,9 @@ def get_pypi_downloads(
     if previous is not None:
         if isinstance(projects, str):
             projects = (projects,)
-        previous_projects = previous[previous.project.isin(projects)]
-        min_date = previous_projects.timestamp.min().date()
-        max_date = previous_projects.timestamp.max().date()
+        previous_projects = previous[previous['project'].isin(projects)]
+        min_date = previous_projects['timestamp'].min().date()
+        max_date = previous_projects['timestamp'].max().date()
     else:
         previous = pd.DataFrame(columns=OUTPUT_COLUMNS)
         min_date = None
diff --git a/pymetrics/summarize.py b/pymetrics/summarize.py
index 477ef07..65244bf 100644
--- a/pymetrics/summarize.py
+++ b/pymetrics/summarize.py
@@ -140,6 +140,7 @@ def get_previous_pypi_downloads(output_folder, dry_run=False):
             'system_name': pd.CategoricalDtype(),
             'system_release': pd.CategoricalDtype(),
             'cpu': pd.CategoricalDtype(),
+            'ci': pd.BooleanDtype(),
         },
     }
     if dry_run:
@@ -148,6 +149,9 @@ def get_previous_pypi_downloads(output_folder, dry_run=False):
     LOGGER.info('Parsing version column to Version class objects')
     if 'version' in data.columns:
         data['version'] = data['version'].apply(parse)
+    if 'PROJECT' in data.columns:
+        data = data.rename(columns={'PROJECT': 'project'})
+        data['project'] = data['project'].astype('category')
     return data
 
 

From b41438c4499296ef1b8a6930719c0b10e8bc0e33 Mon Sep 17 00:00:00 2001
From: Gaurav Sheni <gvsheni@gmail.com>
Date: Thu, 24 Jul 2025 17:35:24 -0400
Subject: [PATCH 2/8] updates

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index db91652..3edbd5c 100644
--- a/README.md
+++ b/README.md
@@ -55,7 +55,7 @@ pip install git+ssh://git@github.com/sdv-dev/pymetrics
 ```
 
 ## Local Usage
-Collect metrics from PyPI by running `gitmetrics` on your computer. You need to provide the following:
+Collect metrics from PyPI by running `pymetrics` on your computer. You need to provide the following:
 
 1. BigQuery Credentials. In order to get PyPI download data, you need to execute queries on Google BigQuery.
   Therefore, you will need an authentication JSON file, which must be provided to you by a privileged admin.

From 2b6d8468e32767126ade1fcb0eb465686714ed57 Mon Sep 17 00:00:00 2001
From: Gaurav Sheni <gvsheni@gmail.com>
Date: Thu, 24 Jul 2025 17:37:02 -0400
Subject: [PATCH 3/8] updates

---
 README.md | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/README.md b/README.md
index 3edbd5c..bb3b7f6 100644
--- a/README.md
+++ b/README.md
@@ -76,15 +76,7 @@ pymetrics collect-pypi --max-days 30 --add-metrics --output-folder {OUTPUT_FOLDE
 ## Workflows
 
 ### Daily Collection
-On a daily basis, this workflow collects download data from PyPI and Anaconda. The data is then published in CSV format (`pypi.csv`). In addition, it computes metrics for the PyPI downloads (see below).
-
-#### Metrics
-This PyPI download metrics are computed along several dimensions:
-
-- **By Month**: The number of downloads per month.
-- **By Version**: The number of downloads per version of the software, as determined by the software maintainers.
-- **By Python Version**: The number of downloads per minor Python version (eg. 3.8).
-- **And more!**
+On a daily basis, this workflow collects download data from PyPI and Anaconda. The data is then published in CSV format (`pypi.csv`). In addition, it computes metrics for the PyPI downloads (see [#Aggregation Metrics](#aggregation-metrics))
 
 ### Daily Summarize
 

From 169153b05ad613cc04406988df2dbc987522f0e3 Mon Sep 17 00:00:00 2001
From: Gaurav Sheni <gvsheni@gmail.com>
Date: Thu, 24 Jul 2025 17:38:16 -0400
Subject: [PATCH 4/8] fix

---
 pymetrics/__main__.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pymetrics/__main__.py b/pymetrics/__main__.py
index 3b28406..8b925c4 100644
--- a/pymetrics/__main__.py
+++ b/pymetrics/__main__.py
@@ -175,8 +175,7 @@ def _get_parser():
         '--max-days',
         type=int,
         required=False,
-        default=30,
-        help='Max days of data to pull if start-date is not given. Default to last 30 days.',
+        help='Max days of data to pull if start-date is not given',
     )
     collect_pypi.add_argument(
         '-f',

From 62291f6f859033c98e5b21f903b006461c9bd445 Mon Sep 17 00:00:00 2001
From: Gaurav Sheni <gvsheni@gmail.com>
Date: Thu, 24 Jul 2025 17:44:47 -0400
Subject: [PATCH 5/8] fix

---
 pymetrics/summarize.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/pymetrics/summarize.py b/pymetrics/summarize.py
index 65244bf..1cc4310 100644
--- a/pymetrics/summarize.py
+++ b/pymetrics/summarize.py
@@ -149,9 +149,6 @@ def get_previous_pypi_downloads(output_folder, dry_run=False):
     LOGGER.info('Parsing version column to Version class objects')
     if 'version' in data.columns:
         data['version'] = data['version'].apply(parse)
-    if 'PROJECT' in data.columns:
-        data = data.rename(columns={'PROJECT': 'project'})
-        data['project'] = data['project'].astype('category')
     return data
 
 

From 98503ca5a513f942ac6a32f57796969d0c440638 Mon Sep 17 00:00:00 2001
From: Gaurav Sheni <gvsheni@gmail.com>
Date: Fri, 25 Jul 2025 12:50:48 -0400
Subject: [PATCH 6/8] fix parse version

---
 ...ummarize.yaml => daily_summarization.yaml} |  2 +-
 README.md                                     |  2 +-
 pymetrics/metrics.py                          | 62 +++++++++----------
 pymetrics/output.py                           |  8 +--
 pymetrics/summarize.py                        |  3 +-
 tests/unit/test_metrics.py                    | 54 ++++++++++++++++
 6 files changed, 91 insertions(+), 40 deletions(-)
 rename .github/workflows/{daily_summarize.yaml => daily_summarization.yaml} (98%)
 create mode 100644 tests/unit/test_metrics.py

diff --git a/.github/workflows/daily_summarize.yaml b/.github/workflows/daily_summarization.yaml
similarity index 98%
rename from .github/workflows/daily_summarize.yaml
rename to .github/workflows/daily_summarization.yaml
index 9bbd938..04350c3 100644
--- a/.github/workflows/daily_summarize.yaml
+++ b/.github/workflows/daily_summarization.yaml
@@ -1,4 +1,4 @@
-name: Daily Summarize
+name: Daily Summarization
 
 on:
   workflow_dispatch:
diff --git a/README.md b/README.md
index bb3b7f6..0dd3f00 100644
--- a/README.md
+++ b/README.md
@@ -78,7 +78,7 @@ pymetrics collect-pypi --max-days 30 --add-metrics --output-folder {OUTPUT_FOLDE
 ### Daily Collection
 On a daily basis, this workflow collects download data from PyPI and Anaconda. The data is then published in CSV format (`pypi.csv`). In addition, it computes metrics for the PyPI downloads (see [#Aggregation Metrics](#aggregation-metrics))
 
-### Daily Summarize
+### Daily Summarization
 
 On a daily basis, this workflow summarizes the PyPI download data from `pypi.csv` and calculates downloads for libraries. The summarized data is published to a GitHub repo:
 - [Downloads_Summary.xlsx](https://github.com/sdv-dev/sdv-dev.github.io/blob/gatsby-home/assets/Downloads_Summary.xlsx)
diff --git a/pymetrics/metrics.py b/pymetrics/metrics.py
index d6bfbcd..154dcef 100644
--- a/pymetrics/metrics.py
+++ b/pymetrics/metrics.py
@@ -1,9 +1,10 @@
 """Functions to compute aggregation metrics over raw downloads."""
 
 import logging
-import re
 
+import numpy as np
 import pandas as pd
+from packaging.version import InvalidVersion, Version
 
 from pymetrics.output import create_spreadsheet
 
@@ -105,34 +106,6 @@ def _get_sheet_name(column):
 ]
 
 
-RE_NUMERIC = re.compile(r'^\d+')
-
-
-def _version_element_order_key(version):
-    components = []
-    last_component = None
-    last_numeric = None
-    for component in version.split('.', 2):
-        if RE_NUMERIC.match(component):
-            try:
-                numeric = RE_NUMERIC.match(component).group(0)
-                components.append(int(numeric))
-                last_component = component
-                last_numeric = numeric
-            except AttributeError:
-                # From time to time this errors out in github actions
-                # while it shouldn't enter the `if`.
-                pass
-
-    components.append(last_component[len(last_numeric) :])
-
-    return components
-
-
-def _version_order_key(version_column):
-    return version_column.apply(_version_element_order_key)
-
-
 def _mangle_columns(downloads):
     downloads = downloads.rename(columns=RENAME_COLUMNS)
     for col in [
@@ -154,6 +127,32 @@ def _mangle_columns(downloads):
     return downloads
 
 
+def _safe_version_parse(version_str):
+    if pd.isna(version_str):
+        return np.nan
+
+    try:
+        version = Version(str(version_str))
+    except InvalidVersion:
+        cleaned = str(version_str).rstrip('+~')
+        try:
+            version = Version(cleaned)
+        except (InvalidVersion, TypeError):
+            LOGGER.info(f'Unable to parse version: {version_str}')
+            version = np.nan
+
+    return version
+
+
+def _version_order_key(version_column):
+    return version_column.apply(_safe_version_parse)
+
+
+def _sort_by_version(data, column, ascending=False):
+    data = data.sort_values(by=column, key=_version_order_key, ascending=ascending)
+    return data
+
+
 def compute_metrics(downloads, output_path=None):
     """Compute aggregation metrics over the given downloads.
 
@@ -172,8 +171,7 @@ def compute_metrics(downloads, output_path=None):
         if column in SORT_BY_DOWNLOADS:
             sheet = sheet.sort_values('downloads', ascending=False)
         elif column in SORT_BY_VERSION:
-            sheet = sheet.sort_values(column, ascending=False, key=_version_order_key)
-
+            sheet = _sort_by_version(sheet, column=column, ascending=False)
         sheets[name] = sheet
 
     for column in HISTORICAL_COLUMNS:
@@ -182,7 +180,7 @@ def compute_metrics(downloads, output_path=None):
         sheets[name] = _historical_groupby(downloads, [column])
 
     if output_path:
-        create_spreadsheet(output_path, sheets)
+        create_spreadsheet(output_path, sheets, na_rep='<NaN>')
         return None
 
     return sheets
diff --git a/pymetrics/output.py b/pymetrics/output.py
index 0125ac6..1fa77c9 100644
--- a/pymetrics/output.py
+++ b/pymetrics/output.py
@@ -34,8 +34,8 @@ def get_path(folder, filename):
     return str(pathlib.Path(folder) / filename)
 
 
-def _add_sheet(writer, data, sheet_name):
-    data.to_excel(writer, sheet_name=sheet_name, index=False, engine='xlsxwriter')
+def _add_sheet(writer, data, sheet_name, na_rep=''):
+    data.to_excel(writer, sheet_name=sheet_name, index=False, engine='xlsxwriter', na_rep=na_rep)
 
     for column in data:
         column_length = None
@@ -51,7 +51,7 @@ def _add_sheet(writer, data, sheet_name):
         )
 
 
-def create_spreadsheet(output_path, sheets):
+def create_spreadsheet(output_path, sheets, na_rep=''):
     """Create a spreadsheet with the indicated name and data.
 
     If the ``output_path`` variable starts with ``gdrive://`` it is interpreted
@@ -74,7 +74,7 @@ def create_spreadsheet(output_path, sheets):
 
     with pd.ExcelWriter(output, engine='xlsxwriter') as writer:  # pylint: disable=E0110
         for title, data in sheets.items():
-            _add_sheet(writer, data, title)
+            _add_sheet(writer, data, title, na_rep=na_rep)
 
     if drive.is_drive_path(output_path):
         LOGGER.info('Creating file %s', output_path)
diff --git a/pymetrics/summarize.py b/pymetrics/summarize.py
index 1cc4310..65efc2e 100644
--- a/pymetrics/summarize.py
+++ b/pymetrics/summarize.py
@@ -147,8 +147,7 @@ def get_previous_pypi_downloads(output_folder, dry_run=False):
         read_csv_kwargs['nrows'] = 10_000
     data = load_csv(csv_path, read_csv_kwargs=read_csv_kwargs)
     LOGGER.info('Parsing version column to Version class objects')
-    if 'version' in data.columns:
-        data['version'] = data['version'].apply(parse)
+    data['version'] = data['version'].apply(parse)
     return data
 
 
diff --git a/tests/unit/test_metrics.py b/tests/unit/test_metrics.py
new file mode 100644
index 0000000..f54ff07
--- /dev/null
+++ b/tests/unit/test_metrics.py
@@ -0,0 +1,54 @@
+import numpy as np
+import pandas as pd
+
+from pymetrics.metrics import _sort_by_version
+
+
+def test__sort_by_version():
+    # Setup
+    data = pd.DataFrame({
+        'version': pd.Series(
+            ['1.9.0', '1.9.0.dev0', '1.24.1', '0.9.1', '0.16.0', '0.0.0'], dtype='object'
+        ),
+        'name': ['v5', 'v4', 'v6', 'v2', 'v3', 'v1'],
+    })
+
+    # Run
+    sorted_df = _sort_by_version(data, 'version', ascending=False)
+
+    # Assert
+    expected_versions = ['1.24.1', '1.9.0', '1.9.0.dev0', '0.16.0', '0.9.1', '0.0.0']
+    assert sorted_df['version'].map(str).tolist() == expected_versions
+    assert sorted_df['name'].tolist() == ['v6', 'v5', 'v4', 'v3', 'v2', 'v1']
+
+
+def test__sort_by_version_with_invalid_versions():
+    # Setup
+    data = pd.DataFrame({
+        'version': pd.Series(['2.7.11+', '2.0.0', 'invalid', '3.0', np.nan], dtype='object'),
+        'name': ['v4', 'v3', 'v2', 'v5', 'v1'],
+    })
+
+    # Run
+    sorted_df = _sort_by_version(data, 'version')
+
+    # Assert
+    expected_versions = ['3.0', '2.7.11+', '2.0.0', 'invalid', np.nan]
+    assert sorted_df['version'].tolist() == expected_versions
+    assert sorted_df['name'].tolist() == ['v5', 'v4', 'v3', 'v2', 'v1']
+
+
+def test__sort_by_version_with_mixed_version_formats():
+    # Setup
+    data = pd.DataFrame({
+        'version': ['1.0a1', '1.0b2', '1.0rc3', '1.0', '1.0.post0'],
+        'name': ['alpha', 'beta', 'rc', 'stable', 'post'],
+    })
+
+    # Run
+    sorted_df = _sort_by_version(data, 'version', ascending=False)
+
+    # Assert
+    expected_versions = ['1.0.post0', '1.0', '1.0rc3', '1.0b2', '1.0a1']
+    assert sorted_df['version'].tolist() == expected_versions
+    assert sorted_df['name'].tolist() == ['post', 'stable', 'rc', 'beta', 'alpha']

From d4f6f79cb49b4b18362b8afee74df72ffd563e04 Mon Sep 17 00:00:00 2001
From: Gaurav Sheni <gvsheni@gmail.com>
Date: Fri, 25 Jul 2025 13:04:19 -0400
Subject: [PATCH 7/8] fix readme

---
 README.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 0dd3f00..6282187 100644
--- a/README.md
+++ b/README.md
@@ -61,11 +61,11 @@ Collect metrics from PyPI by running `pymetrics` on your computer. You need to p
   Therefore, you will need an authentication JSON file, which must be provided to you by a privileged admin.
   Once you have this JSON file, export the contents of the credentials file into a
   `BIGQUERY_CREDENTIALS` environment variable.
-2. A set of Google Drive Credentials need to be provided in the format required by `PyDrive`. The
-   credentials must be passed via the `PYDRIVE_CREDENTIALS` environment variable.
-   - See [instructions from PyDrive](https://pythonhosted.org/PyDrive/quickstart.html).
-3. A list of PyPI projects for which to collect the download metrics, defined in a YAML file.
+2. A list of PyPI projects for which to collect the download metrics, defined in a YAML file.
    See [config.yaml](./config.yaml) for an example.
+3. Optional. A set of Google Drive Credentials can be provided in the format required by `PyDrive`. The
+   credentials can be passed via the `PYDRIVE_CREDENTIALS` environment variable.
+   - See [instructions from PyDrive](https://pythonhosted.org/PyDrive/quickstart.html).
 
 You can run pymetrics with the following CLI command:
 

From 901dc804321983035fa129ee77c7aae7bce74de9 Mon Sep 17 00:00:00 2001
From: Gaurav Sheni <gvsheni@gmail.com>
Date: Fri, 25 Jul 2025 13:20:36 -0400
Subject: [PATCH 8/8] update filename

---
 pymetrics/drive.py  | 2 +-
 pymetrics/output.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pymetrics/drive.py b/pymetrics/drive.py
index b29362c..0b99a59 100644
--- a/pymetrics/drive.py
+++ b/pymetrics/drive.py
@@ -97,7 +97,7 @@ def upload(content, filename, folder, convert=False):
 
     drive_file.content = content
     drive_file.Upload({'convert': convert})
-    LOGGER.info('Uploaded file %s', drive_file.metadata['alternateLink'])
+    LOGGER.info(f'Uploaded filename {filename}')
 
 
 def download(folder, filename, xlsx=False):
diff --git a/pymetrics/output.py b/pymetrics/output.py
index 1fa77c9..2c4c5e9 100644
--- a/pymetrics/output.py
+++ b/pymetrics/output.py
@@ -77,8 +77,8 @@ def create_spreadsheet(output_path, sheets, na_rep=''):
             _add_sheet(writer, data, title, na_rep=na_rep)
 
     if drive.is_drive_path(output_path):
-        LOGGER.info('Creating file %s', output_path)
         folder, filename = drive.split_drive_path(output_path)
+        LOGGER.info(f'Creating filename {filename}')
         drive.upload(output, filename, folder, convert=True)
     else:
         if not output_path.endswith('.xlsx'):