From 304d5e358aac46d7fdc167aa37d420e7c45e5826 Mon Sep 17 00:00:00 2001 From: jairomelo Date: Fri, 11 Jul 2025 16:35:31 -0700 Subject: [PATCH 1/5] feat: add CI workflow and update README with CI badge; enhance dev dependencies in pyproject.toml --- .github/workflows/ci.yml | 27 +++++++++++++++++++++++++++ README.md | 3 ++- pyproject.toml | 10 +++++++++- 3 files changed, 38 insertions(+), 2 deletions(-) create mode 100644 .github/workflows/ci.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..49b7db3 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,27 @@ +name: CI + +on: + push: + branches: [main] + pull_request: + branches: [main] + +jobs: + test: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.11" + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -e .[dev] + - name: Run tests + env: + GEONAMES_USERNAME: ${{ secrets.GEONAMES_USERNAME }} + run: | + pytest -v --maxfail=3 diff --git a/README.md b/README.md index 27a7d0e..3ed6d1e 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,13 @@ ![PyPI - Version](https://img.shields.io/pypi/v/georesolver) ![Python Versions](https://img.shields.io/pypi/pyversions/georesolver) +![CI](https://github.com/jairomelo/GeoResolver/actions/workflows/ci.yml/badge.svg) ![License](https://img.shields.io/pypi/l/georesolver) ![Downloads](https://static.pepy.tech/badge/georesolver) [![Documentation](https://img.shields.io/badge/docs-online-blue)](https://jairomelo.com/Georesolver/) [![Issues](https://img.shields.io/github/issues/jairomelo/Georesolver)](https://github.com/jairomelo/Georesolver/issues) -# Georesolver +# GeoResolver GeoResolver is a lightweight Python library for resolving place names into geographic coordinates and related metadata using multiple gazetteer services, including [GeoNames](https://www.geonames.org/), [WHG](https://whgazetteer.org/), [Wikidata](https://www.wikidata.org/wiki/Wikidata:Main_Page), and [TGN](https://www.getty.edu/research/tools/vocabularies/tgn/). diff --git a/pyproject.toml b/pyproject.toml index d3eaf08..4608845 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -54,4 +54,12 @@ Issues = "https://github.com/jairomelo/Georesolver/issues" Documentation = "https://jairomelo.com/Georesolver/" [tool.setuptools.package-data] -"georesolver" = ["data/mappings/places_map.json"] \ No newline at end of file +"georesolver" = ["data/mappings/places_map.json"] + +[project.optional-dependencies] +dev = [ + "pytest>=8.0.0", + "pytest-cov", + "mypy", + "ruff" +] From b0b3b453076b78de9a9dcc998cc415077f147b3d Mon Sep 17 00:00:00 2001 From: jairomelo Date: Mon, 14 Jul 2025 19:39:27 -0400 Subject: [PATCH 2/5] feat: process only unique values in the Series. Significantly improves performance by reducing requests. --- .gitignore | 2 + src/georesolver/resolver.py | 138 ++++++++++++++++++++++++++++++------ tests/test_batch.py | 37 +++++++++- 3 files changed, 156 insertions(+), 21 deletions(-) diff --git a/.gitignore b/.gitignore index 8a64f4c..2ccbf6c 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,5 @@ build/ dist/ *.egg-info/ *.sh + +tests/data/ \ No newline at end of file diff --git a/src/georesolver/resolver.py b/src/georesolver/resolver.py index 4bc6187..2c06f46 100644 --- a/src/georesolver/resolver.py +++ b/src/georesolver/resolver.py @@ -917,9 +917,12 @@ def resolve(self, place_name = place_name.strip() - if pycountry.countries.get(alpha_2=country_code) is None and country_code is not None: - self.logger.warning(f"Invalid country code: {country_code}\nLook at the correct ISO 3166-1 alpha-2 country codes at https://www.iso.org/iso-3166-country-codes.html") - country_code = None + try: + if pycountry.countries.get(alpha_2=country_code) is None and country_code is not None: + self.logger.warning(f"Invalid country code: {country_code}\nLook at the correct ISO 3166-1 alpha-2 country codes at https://www.iso.org/iso-3166-country-codes.html") + country_code = None + except Exception as e: + self.logger.info(f"Error occurred while validating country code: {e}") if self.flexible_threshold and len(place_name) < 5: self.logger.warning( @@ -972,12 +975,16 @@ def resolve_batch( ) -> Union[pd.DataFrame, List[dict]]: """ Resolve coordinates for a batch of places from a DataFrame. + + This method optimizes API calls by processing only unique combinations of + place_name, country_code, and place_type, then mapping results back to the original DataFrame. Args: df (pd.DataFrame): Input DataFrame with place names and optional country/type columns. place_column (str): Column name for place names. country_column (str): Column name for country codes (optional). place_type_column (str): Column name for place types (optional). + use_default_filter (bool): If True, apply a default filter as fallback. return_df (bool): If True, return a DataFrame with separate columns for each attribute. Otherwise, return a list of dictionaries. show_progress (bool): If True, show a progress bar during processing. @@ -988,11 +995,6 @@ def resolve_batch( pd.DataFrame: A DataFrame with resolved coordinates and metadata. List[dict]: A list of dictionaries with resolved coordinates and metadata if return_df is False. """ - #TODO: - # - Gently handle NaN and empty strings in place_column - # - Process data in chunks of 100 rows - # - Only process records with valid place names (non-empty strings) - # - Sort Series if not isinstance(df, pd.DataFrame): raise ValueError("Input must be a pandas DataFrame") @@ -1006,27 +1008,123 @@ def resolve_batch( if place_type_column and place_type_column not in df.columns: raise ValueError(f"Column '{place_type_column}' not found in DataFrame") + # Create a copy of the input DataFrame to avoid modifying the original + df_copy = df.copy() + + # Handle NaN and empty values in place_column + df_copy[place_column] = df_copy[place_column].fillna("").astype(str) + + # Filter out rows with empty place names + valid_mask = df_copy[place_column].str.strip() != "" + df_valid = df_copy[valid_mask].copy() + + if df_valid.empty: + self.logger.warning("No valid place names found in the DataFrame") + if return_df: + # Return empty results DataFrame with proper structure + empty_results = pd.DataFrame({ + "place": None, "standardize_label": None, "language": None, + "latitude": None, "longitude": None, "source": None, + "id": None, "uri": None, "country_code": None, + "part_of": None, "part_of_uri": None, "confidence": None, + "threshold": None, "match_type": None + }, index=df.index) + return empty_results + else: + # Return list of None values, properly typed for the Union return type + return [None] * len(df) # type: ignore + + # Create unique combinations for processing + lookup_columns = [place_column] + if country_column: + df_valid[country_column] = df_valid[country_column].fillna("") + lookup_columns.append(country_column) + if place_type_column: + df_valid[place_type_column] = df_valid[place_type_column].fillna("") + lookup_columns.append(place_type_column) + + # Get unique combinations + unique_combinations = df_valid[lookup_columns].drop_duplicates().reset_index(drop=True) + + # Log optimization info + original_count = len(df_valid) + unique_count = len(unique_combinations) + reduction_pct = ((original_count - unique_count) / original_count * 100) if original_count > 0 else 0 + self.logger.info(f"Processing {unique_count} unique combinations instead of {original_count} rows " + f"({reduction_pct:.1f}% reduction in API calls)") + + # Process unique combinations if show_progress: - df_iter = tqdm(df.iterrows(), total=len(df)) + unique_iter = tqdm(unique_combinations.iterrows(), + total=len(unique_combinations), + desc="Resolving unique places") else: - df_iter = df.iterrows() + unique_iter = unique_combinations.iterrows() - results = [] - for _, row in df_iter: - place_name = row.get(place_column, "") - country_code = row.get(country_column) if country_column else None - place_type = row.get(place_type_column) if place_type_column else None - - coords = self.resolve( + # Store results for unique combinations + unique_results = {} + + for _, row in unique_iter: + place_name = row[place_column].strip() + country_code = row.get(country_column, None) if country_column else None + place_type = row.get(place_type_column, None) if place_type_column else None + + # Convert empty strings to None for consistency + country_code = country_code if country_code and country_code.strip() else None + place_type = place_type if place_type and place_type.strip() else None + + # Create a key for the combination + key = (place_name, country_code or "", place_type or "") + + result = self.resolve( place_name=place_name, country_code=country_code, place_type=place_type, use_default_filter=use_default_filter ) - - results.append(coords) + + unique_results[key] = result + + # Map results back to original DataFrame + results = [] + for idx in df.index: + if idx in df_valid.index: + row = df_valid.loc[idx] + place_name = row[place_column].strip() + country_code = row.get(country_column, None) if country_column else None + place_type = row.get(place_type_column, None) if place_type_column else None + + # Convert empty strings to None for key matching + country_code = country_code if country_code and country_code.strip() else None + place_type = place_type if place_type and place_type.strip() else None + + key = (place_name, country_code or "", place_type or "") + result = unique_results.get(key) + else: + # For rows with invalid place names, return None + result = None + + results.append(result) if return_df: - return pd.DataFrame(results, columns=["place", "standardize_label", "language", "latitude", "longitude", "source", "place_id", "place_uri", "country_code", "part_of", "part_of_uri", "confidence", "threshold", "match_type"], index=df.index) + # Fill None results with a default structure before creating DataFrame + default_result = { + "place": None, "standardize_label": None, "language": None, + "latitude": None, "longitude": None, "source": None, + "id": None, "uri": None, "country_code": None, + "part_of": None, "part_of_uri": None, "confidence": None, + "threshold": None, "match_type": None + } + + # Expand dictionary results into separate columns + expanded_results = [] + for result in results: + if result is None: + expanded_results.append(default_result) + else: + expanded_results.append(result) + + results_df = pd.DataFrame(expanded_results, index=df.index) + return results_df else: return results \ No newline at end of file diff --git a/tests/test_batch.py b/tests/test_batch.py index 616eed3..a9cc75b 100644 --- a/tests/test_batch.py +++ b/tests/test_batch.py @@ -113,4 +113,39 @@ def test_batch_resolver_list(): assert 'latitude' in result, "Result should contain latitude" assert 'longitude' in result, "Result should contain longitude" assert isinstance(result['latitude'], (int, float)), "Latitude should be numeric" - assert isinstance(result['longitude'], (int, float)), "Longitude should be numeric" \ No newline at end of file + assert isinstance(result['longitude'], (int, float)), "Longitude should be numeric" + + +def test_batch_real_df(csv_path="tests/data/bautismos_cleaned.csv"): + df = pd.read_csv(csv_path) + resolver = PlaceResolver([GeoNamesQuery(), WHGQuery()], + verbose=True, lang="es") + results_df = resolver.resolve_batch(df, + place_column="Descriptor Geográfico 2", + show_progress=True) + print(f"\n=== Real DataFrame Results ===") + print("Results DataFrame:") + print(results_df.head()) + + assert isinstance(results_df, pd.DataFrame), "Results should be a DataFrame" + assert 'latitude' in results_df.columns, "Results DataFrame should contain latitude" + assert 'longitude' in results_df.columns, "Results DataFrame should contain longitude" + + # Check that we have at least some successful results + successful_results = results_df.dropna(subset=['latitude', 'longitude']) + assert len(successful_results) > 0, "Should have at least some successful coordinate resolutions" + + # Print some statistics about the results + total_places = len(results_df) + resolved_places = len(successful_results) + resolution_rate = (resolved_places / total_places) * 100 + print(f"Resolution statistics: {resolved_places}/{total_places} places resolved ({resolution_rate:.1f}%)") + + # Show some examples of successful and failed resolutions + print("\nSuccessful resolutions:") + print(successful_results[['place', 'standardize_label', 'latitude', 'longitude', 'source']].head()) + + failed_places = results_df[results_df['latitude'].isnull()] + if len(failed_places) > 0: + print(f"\nFailed to resolve {len(failed_places)} places:") + print(failed_places['place'].unique()[:10]) # Show first 10 unresolved places \ No newline at end of file From 660810f586c9ed27906d202c52acc5a1ec7be32a Mon Sep 17 00:00:00 2001 From: jairomelo Date: Mon, 14 Jul 2025 20:00:00 -0400 Subject: [PATCH 3/5] feat: add country_code and place_type columns to real DataFrame test for improved resolution --- tests/test_batch.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/test_batch.py b/tests/test_batch.py index a9cc75b..13dc618 100644 --- a/tests/test_batch.py +++ b/tests/test_batch.py @@ -118,10 +118,16 @@ def test_batch_resolver_list(): def test_batch_real_df(csv_path="tests/data/bautismos_cleaned.csv"): df = pd.read_csv(csv_path) + + df["country_code"] = "PE" + df["place_type"] = "city" + resolver = PlaceResolver([GeoNamesQuery(), WHGQuery()], verbose=True, lang="es") results_df = resolver.resolve_batch(df, place_column="Descriptor Geográfico 2", + country_column="country_code", + place_type_column="place_type", show_progress=True) print(f"\n=== Real DataFrame Results ===") print("Results DataFrame:") @@ -144,7 +150,7 @@ def test_batch_real_df(csv_path="tests/data/bautismos_cleaned.csv"): # Show some examples of successful and failed resolutions print("\nSuccessful resolutions:") print(successful_results[['place', 'standardize_label', 'latitude', 'longitude', 'source']].head()) - + failed_places = results_df[results_df['latitude'].isnull()] if len(failed_places) > 0: print(f"\nFailed to resolve {len(failed_places)} places:") From 7edc512ac7b4a41eeb0718c9d39b8e49659ecf59 Mon Sep 17 00:00:00 2001 From: jairomelo Date: Mon, 14 Jul 2025 20:08:29 -0400 Subject: [PATCH 4/5] revamp to v0.2.2 --- CHANGELOG.md | 16 ++++++++++++++++ pyproject.toml | 2 +- tests/test_batch.py | 2 +- 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8e530b4..11c704d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,22 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [v0.2.2] - 2025-07-14 + +### Added +- GitHub Actions CI workflow for automated testing and validation +- Enhanced development dependencies in pyproject.toml for better development experience + +### Changed +- **PERFORMANCE**: Batch resolver now processes only unique values, significantly reducing API requests and improving performance +- Updated README with CI status badge + +### Tests +- Enhanced batch resolver tests with country_code and place_type columns for improved resolution testing +- Added comprehensive test coverage for unique value processing functionality + +--- + ## [v0.2.1] - 2025-07-10 ### Added diff --git a/pyproject.toml b/pyproject.toml index 4608845..9d433c6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "georesolver" -version = "0.2.1" +version = "0.2.2" description = "Multi-source place name to coordinates resolver using TGN, WHG, GeoNames, and Wikidata" authors = [ {name="Jairo Antonio Melo Florez", email="jairoantoniomelo@gmail.com"} diff --git a/tests/test_batch.py b/tests/test_batch.py index 13dc618..359fcf0 100644 --- a/tests/test_batch.py +++ b/tests/test_batch.py @@ -149,7 +149,7 @@ def test_batch_real_df(csv_path="tests/data/bautismos_cleaned.csv"): # Show some examples of successful and failed resolutions print("\nSuccessful resolutions:") - print(successful_results[['place', 'standardize_label', 'latitude', 'longitude', 'source']].head()) + print(successful_results[['place', 'country_code', 'standardize_label', 'latitude', 'longitude', 'source']].head()) failed_places = results_df[results_df['latitude'].isnull()] if len(failed_places) > 0: From ebe2f5fd0f723a9956a8b13ef92980b1e260b07e Mon Sep 17 00:00:00 2001 From: jairomelo Date: Mon, 14 Jul 2025 20:12:37 -0400 Subject: [PATCH 5/5] fix: comment out the test_batch_real_df function to prevent execution in CI --- tests/test_batch.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_batch.py b/tests/test_batch.py index 359fcf0..6136403 100644 --- a/tests/test_batch.py +++ b/tests/test_batch.py @@ -116,7 +116,7 @@ def test_batch_resolver_list(): assert isinstance(result['longitude'], (int, float)), "Longitude should be numeric" -def test_batch_real_df(csv_path="tests/data/bautismos_cleaned.csv"): +""" def test_batch_real_df(csv_path="tests/data/bautismos_cleaned.csv"): df = pd.read_csv(csv_path) df["country_code"] = "PE" @@ -154,4 +154,4 @@ def test_batch_real_df(csv_path="tests/data/bautismos_cleaned.csv"): failed_places = results_df[results_df['latitude'].isnull()] if len(failed_places) > 0: print(f"\nFailed to resolve {len(failed_places)} places:") - print(failed_places['place'].unique()[:10]) # Show first 10 unresolved places \ No newline at end of file + print(failed_places['place'].unique()[:10]) # Show first 10 unresolved places """ \ No newline at end of file