From 5f5c18d57db4872d5c77c792c7e788e320ab5274 Mon Sep 17 00:00:00 2001 From: cbrammer Date: Tue, 25 Mar 2025 17:01:05 -0600 Subject: [PATCH 1/5] Filter out items not in the date range of start_date --- tap_github/client.py | 72 ++++++++++++++------------------------------ 1 file changed, 23 insertions(+), 49 deletions(-) diff --git a/tap_github/client.py b/tap_github/client.py index 14568e4a..b6d11bfc 100644 --- a/tap_github/client.py +++ b/tap_github/client.py @@ -57,6 +57,9 @@ def url_base(self) -> str: replication_key: str | None = None tolerated_http_errors: ClassVar[list[int]] = [] + # Save the context from the requests so it can be available to the parse_response method + context: dict | None = None + @property def http_headers(self) -> dict[str, str]: """Return the http headers needed.""" @@ -142,6 +145,9 @@ def get_url_params( context: dict | None, next_page_token: Any | None, # noqa: ANN401 ) -> dict[str, Any]: + # save the context from the requests so it can be available to the parse_response method + self.context = context + """Return a dictionary of values to be used in URL parameterization.""" params: dict = {"per_page": self.MAX_PER_PAGE} if next_page_token: @@ -250,7 +256,7 @@ def validate_response(self, response: requests.Response) -> None: def parse_response(self, response: requests.Response) -> Iterable[dict]: """Parse the response and return an iterator of result rows.""" - # TODO - Split into handle_reponse and parse_response. + # TODO - Split into handle_response and parse_response. if response.status_code in ( [*self.tolerated_http_errors, EMPTY_REPO_ERROR_STATUS] ): @@ -259,8 +265,8 @@ def parse_response(self, response: requests.Response) -> Iterable[dict]: # Update token rate limit info and loop through tokens if needed. 
self.authenticator.update_rate_limit(response.headers) + # Get all items from the response resp_json = response.json() - if isinstance(resp_json, list): results = resp_json elif resp_json.get("items") is not None: @@ -268,7 +274,21 @@ def parse_response(self, response: requests.Response) -> Iterable[dict]: else: results = [resp_json] - yield from results + if not results: + return + + # Filter items based on replication key's date if needed + since = self.get_starting_timestamp(self.context) + filtered_results = [] + if self.replication_key and self.use_fake_since_parameter and since: + for item in results: + item_date = parse(item[self.replication_key]) + if item_date >= since: + filtered_results.append(item) + else: + filtered_results = results + + yield from filtered_results def post_process(self, row: dict, context: dict[str, str] | None = None) -> dict: """Add `repo_id` by default to all streams.""" @@ -306,52 +326,6 @@ def calculate_sync_cost( return {"rest": 1, "graphql": 0, "search": 0} -class GitHubDiffStream(GitHubRestStream): - """Base class for GitHub diff streams.""" - - @property - def http_headers(self) -> dict: - """Return the http headers needed for diff requests.""" - headers = super().http_headers - headers["Accept"] = "application/vnd.github.v3.diff" - return headers - - def parse_response(self, response: requests.Response) -> Iterable[dict]: - """Parse the response to yield the diff text instead of an object - and prevent buffer overflow.""" - if response.status_code != 200: - contents = response.json() - self.logger.info( - "Skipping %s due to %d error: %s", - self.name.replace("_", " "), - response.status_code, - contents["message"], - ) - yield { - "success": False, - "error_message": contents["message"], - } - return - - if content_length_str := response.headers.get("Content-Length"): - content_length = int(content_length_str) - max_size = 41_943_040 # 40 MiB - if content_length > max_size: - self.logger.info( - "Skipping %s. 
The diff size (%.2f MiB) exceeded the maximum" - " size limit of 40 MiB.", - self.name.replace("_", " "), - content_length / 1024 / 1024, - ) - yield { - "success": False, - "error_message": "Diff exceeded the maximum size limit of 40 MiB.", - } - return - - yield {"diff": response.text, "success": True} - - class GitHubGraphqlStream(GraphQLStream, GitHubRestStream): """GitHub Graphql stream class.""" From 1bee702541f6d9f312ded165fa1539a2a385107d Mon Sep 17 00:00:00 2001 From: cbrammer Date: Tue, 25 Mar 2025 17:08:30 -0600 Subject: [PATCH 2/5] Inadvertently removed the GitHubDiffStream, reverting --- tap_github/client.py | 46 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/tap_github/client.py b/tap_github/client.py index b6d11bfc..fee23591 100644 --- a/tap_github/client.py +++ b/tap_github/client.py @@ -326,6 +326,52 @@ def calculate_sync_cost( return {"rest": 1, "graphql": 0, "search": 0} +class GitHubDiffStream(GitHubRestStream): + """Base class for GitHub diff streams.""" + + @property + def http_headers(self) -> dict: + """Return the http headers needed for diff requests.""" + headers = super().http_headers + headers["Accept"] = "application/vnd.github.v3.diff" + return headers + + def parse_response(self, response: requests.Response) -> Iterable[dict]: + """Parse the response to yield the diff text instead of an object + and prevent buffer overflow.""" + if response.status_code != 200: + contents = response.json() + self.logger.info( + "Skipping %s due to %d error: %s", + self.name.replace("_", " "), + response.status_code, + contents["message"], + ) + yield { + "success": False, + "error_message": contents["message"], + } + return + + if content_length_str := response.headers.get("Content-Length"): + content_length = int(content_length_str) + max_size = 41_943_040 # 40 MiB + if content_length > max_size: + self.logger.info( + "Skipping %s. 
The diff size (%.2f MiB) exceeded the maximum" + " size limit of 40 MiB.", + self.name.replace("_", " "), + content_length / 1024 / 1024, + ) + yield { + "success": False, + "error_message": "Diff exceeded the maximum size limit of 40 MiB.", + } + return + + yield {"diff": response.text, "success": True} + + class GitHubGraphqlStream(GraphQLStream, GitHubRestStream): """GitHub Graphql stream class.""" From 3966cca666d4d19495397307b5637209402467fd Mon Sep 17 00:00:00 2001 From: cbrammer Date: Thu, 27 Mar 2025 15:41:11 -0600 Subject: [PATCH 3/5] Update tap_github/client.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Edgar Ramírez Mondragón --- tap_github/client.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tap_github/client.py b/tap_github/client.py index fee23591..ffd82430 100644 --- a/tap_github/client.py +++ b/tap_github/client.py @@ -57,9 +57,6 @@ def url_base(self) -> str: replication_key: str | None = None tolerated_http_errors: ClassVar[list[int]] = [] - # Save the context from the requests so it can be available to the parse_response method - context: dict | None = None - @property def http_headers(self) -> dict[str, str]: """Return the http headers needed.""" From 8a8b497325e7498901a67771ec0e9560c666fe29 Mon Sep 17 00:00:00 2001 From: cbrammer Date: Thu, 27 Mar 2025 15:41:17 -0600 Subject: [PATCH 4/5] Update tap_github/client.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Edgar Ramírez Mondragón --- tap_github/client.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tap_github/client.py b/tap_github/client.py index ffd82430..dfaff4e4 100644 --- a/tap_github/client.py +++ b/tap_github/client.py @@ -142,9 +142,6 @@ def get_url_params( context: dict | None, next_page_token: Any | None, # noqa: ANN401 ) -> dict[str, Any]: - # save the context from the requests so it can be available to the parse_response method - self.context 
= context - """Return a dictionary of values to be used in URL parameterization.""" params: dict = {"per_page": self.MAX_PER_PAGE} if next_page_token: From 04e447f60c54eee5897883cb8d8732f6d3abb627 Mon Sep 17 00:00:00 2001 From: cbrammer Date: Thu, 3 Apr 2025 20:36:19 -0600 Subject: [PATCH 5/5] Handle NUL values Sadly, the GitHub API returns NUL values in some fields. This function recursively replaces them with empty strings. Otherwise postgres will raise an error when inserting the data. --- tap_github/client.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tap_github/client.py b/tap_github/client.py index dfaff4e4..74bcd014 100644 --- a/tap_github/client.py +++ b/tap_github/client.py @@ -282,8 +282,30 @@ def parse_response(self, response: requests.Response) -> Iterable[dict]: else: filtered_results = results + # Replace NUL values + for item in filtered_results: + self.replace_nul_values(item) + yield from filtered_results + # Sadly, the GitHub API returns NUL values in some fields. + # This function recursively replaces them with empty strings. + # Otherwise postgres will raise an error when inserting the data. + def replace_nul_values(self, obj): + """Recursively replace NUL values in strings within dictionaries and lists.""" + if isinstance(obj, dict): + for key, value in obj.items(): + if isinstance(value, str): + obj[key] = value.replace("\x00", "") + elif isinstance(value, (dict, list)): + self.replace_nul_values(value) + elif isinstance(obj, list): + for i, item in enumerate(obj): + if isinstance(item, str): + obj[i] = item.replace("\x00", "") + elif isinstance(item, (dict, list)): + self.replace_nul_values(item) + def post_process(self, row: dict, context: dict[str, str] | None = None) -> dict: """Add `repo_id` by default to all streams.""" if context is not None and "repo_id" in context: