From 2d4100399184840d7dc250e1ed3fe5b9982b0ed3 Mon Sep 17 00:00:00 2001 From: joaoamaral <7281460+joaopamaral@users.noreply.github.com> Date: Fri, 21 Oct 2022 12:48:52 -0300 Subject: [PATCH 01/52] Adding a 10 second break to ensure the retry is not done before the API key reset --- tap_github/client.py | 3 ++- tests/unittests/test_rate_limit.py | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/tap_github/client.py b/tap_github/client.py index 9913a8c2..20018195 100644 --- a/tap_github/client.py +++ b/tap_github/client.py @@ -128,9 +128,10 @@ def raise_for_error(resp, source, stream, client, should_skip_404): def calculate_seconds(epoch): """ Calculate the seconds to sleep before making a new request. + Note: a 10 second break to ensure the retry is not done before the reset """ current = time.time() - return int(round((epoch - current), 0)) + return int(round((epoch - current), 0)) + 10 def rate_throttling(response, max_sleep_seconds): """ diff --git a/tests/unittests/test_rate_limit.py b/tests/unittests/test_rate_limit.py index 987c60a0..c1bdf601 100644 --- a/tests/unittests/test_rate_limit.py +++ b/tests/unittests/test_rate_limit.py @@ -31,7 +31,7 @@ def test_rate_limt_wait(self, mocked_sleep): rate_throttling(resp, DEFAULT_SLEEP_SECONDS) # Verify `time.sleep` is called with expected seconds in response - mocked_sleep.assert_called_with(120) + mocked_sleep.assert_called_with(130) self.assertTrue(mocked_sleep.called) @@ -49,7 +49,7 @@ def test_rate_limit_exception(self, mocked_sleep): # Verify exception is raised with proper message with self.assertRaises(tap_github.client.RateLimitExceeded) as e: rate_throttling(resp, DEFAULT_SLEEP_SECONDS) - self.assertEqual(str(e.exception), "API rate limit exceeded, please try after 601 seconds.") + self.assertEqual(str(e.exception), "API rate limit exceeded, please try after 611 seconds.") def test_rate_limit_not_exceeded(self, mocked_sleep): From a256a8999c2bbb564f970ca42a76c231d154ddcb Mon Sep 17 00:00:00 2001 From: joaoamaral <7281460+joaopamaral@users.noreply.github.com> Date: Fri, 21 Oct 2022 13:13:48 -0300 Subject: [PATCH 02/52] Revert "Adding a 10 second break to ensure the retry is not done before the API key reset" This reverts commit 2d4100399184840d7dc250e1ed3fe5b9982b0ed3. --- tap_github/client.py | 3 +-- tests/unittests/test_rate_limit.py | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/tap_github/client.py b/tap_github/client.py index 20018195..9913a8c2 100644 --- a/tap_github/client.py +++ b/tap_github/client.py @@ -128,10 +128,9 @@ def raise_for_error(resp, source, stream, client, should_skip_404): def calculate_seconds(epoch): """ Calculate the seconds to sleep before making a new request. - Note: a 10 second break to ensure the retry is not done before the reset """ current = time.time() - return int(round((epoch - current), 0)) + 10 + return int(round((epoch - current), 0)) def rate_throttling(response, max_sleep_seconds): """ diff --git a/tests/unittests/test_rate_limit.py b/tests/unittests/test_rate_limit.py index c1bdf601..987c60a0 100644 --- a/tests/unittests/test_rate_limit.py +++ b/tests/unittests/test_rate_limit.py @@ -31,7 +31,7 @@ def test_rate_limt_wait(self, mocked_sleep): rate_throttling(resp, DEFAULT_SLEEP_SECONDS) # Verify `time.sleep` is called with expected seconds in response - mocked_sleep.assert_called_with(130) + mocked_sleep.assert_called_with(120) self.assertTrue(mocked_sleep.called) @@ -49,7 +49,7 @@ def test_rate_limit_exception(self, mocked_sleep): # Verify exception is raised with proper message with self.assertRaises(tap_github.client.RateLimitExceeded) as e: rate_throttling(resp, DEFAULT_SLEEP_SECONDS) - self.assertEqual(str(e.exception), "API rate limit exceeded, please try after 611 seconds.") + self.assertEqual(str(e.exception), "API rate limit exceeded, please try after 601 seconds.") def test_rate_limit_not_exceeded(self, mocked_sleep): From e2ebb1b15b3bf335a475e95cb37b34419129f768 Mon Sep 17 00:00:00 2001 From: joaoamaral <7281460+joaopamaral@users.noreply.github.com> Date: Fri, 21 Oct 2022 15:05:58 -0300 Subject: [PATCH 03/52] Adding configurable min rate limit and change round to ceil in calculate_seconds method --- tap_github/client.py | 11 +++++---- tests/unittests/test_rate_limit.py | 36 ++++++++++++++++++++++-------- 2 files changed, 34 insertions(+), 13 deletions(-) diff --git a/tap_github/client.py b/tap_github/client.py index 9913a8c2..ed2691ac 100644 --- a/tap_github/client.py +++ b/tap_github/client.py @@ -4,9 +4,11 @@ from simplejson import JSONDecodeError import singer from singer import metrics +from math import ceil LOGGER = singer.get_logger() DEFAULT_SLEEP_SECONDS = 600 +DEFAULT_MIN_RATE_LIMIT = 0 DEFAULT_DOMAIN = "https://api.github.com" # Set default timeout of 300 seconds @@ -130,14 +132,14 @@ def calculate_seconds(epoch): Calculate the seconds to sleep before making a new request. """ current = time.time() - return int(round((epoch - current), 0)) + return int(ceil(epoch - current)) -def rate_throttling(response, max_sleep_seconds): +def rate_throttling(response, max_sleep_seconds, min_rate_limit): """ For rate limit errors, get the remaining time before retrying and calculate the time to sleep before making a new request. """ if 'X-RateLimit-Remaining' in response.headers: - if int(response.headers['X-RateLimit-Remaining']) == 0: + if int(response.headers['X-RateLimit-Remaining']) <= min_rate_limit: seconds_to_sleep = calculate_seconds(int(response.headers['X-RateLimit-Reset'])) if seconds_to_sleep > max_sleep_seconds: @@ -160,6 +162,7 @@ def __init__(self, config): self.session = requests.Session() self.base_url = config['base_url'] if config.get('base_url') else DEFAULT_DOMAIN self.max_sleep_seconds = self.config.get('max_sleep_seconds', DEFAULT_SLEEP_SECONDS) + self.min_rate_limit = self.config.get('min_rate_limit', DEFAULT_MIN_RATE_LIMIT) self.set_auth_in_session() self.not_accessible_repos = set() @@ -198,7 +201,7 @@ def authed_get(self, source, url, headers={}, stream="", should_skip_404 = True) if resp.status_code != 200: raise_for_error(resp, source, stream, self, should_skip_404) timer.tags[metrics.Tag.http_status_code] = resp.status_code - rate_throttling(resp, self.max_sleep_seconds) + rate_throttling(resp, self.max_sleep_seconds, self.min_rate_limit) if resp.status_code == 404: # Return an empty response body since we're not raising a NotFoundException resp._content = b'{}' # pylint: disable=protected-access diff --git a/tests/unittests/test_rate_limit.py b/tests/unittests/test_rate_limit.py index 987c60a0..dd7bfd93 100644 --- a/tests/unittests/test_rate_limit.py +++ b/tests/unittests/test_rate_limit.py @@ -2,10 +2,12 @@ from tap_github.client import rate_throttling, GithubException import unittest from unittest import mock +from math import ceil import time import requests DEFAULT_SLEEP_SECONDS = 600 +DEFAULT_MIN_RATE_LIMIT = 0 def api_call(): return requests.get("https://api.github.com/rate_limit") @@ -25,13 +27,13 @@ def test_rate_limt_wait(self, mocked_sleep): mocked_sleep.side_effect = None resp = api_call() - resp.headers["X-RateLimit-Reset"] = int(round(time.time(), 0)) + 120 + resp.headers["X-RateLimit-Reset"] = int(ceil(time.time())) + 120 resp.headers["X-RateLimit-Remaining"] = 0 - rate_throttling(resp, DEFAULT_SLEEP_SECONDS) + rate_throttling(resp, DEFAULT_SLEEP_SECONDS, DEFAULT_MIN_RATE_LIMIT) # Verify `time.sleep` is called with expected seconds in response - mocked_sleep.assert_called_with(120) + mocked_sleep.assert_called_with(121) self.assertTrue(mocked_sleep.called) @@ -43,13 +45,13 @@ def test_rate_limit_exception(self, mocked_sleep): mocked_sleep.side_effect = None resp = api_call() - resp.headers["X-RateLimit-Reset"] = int(round(time.time(), 0)) + 601 + resp.headers["X-RateLimit-Reset"] = int(ceil(time.time())) + 601 resp.headers["X-RateLimit-Remaining"] = 0 # Verify exception is raised with proper message with self.assertRaises(tap_github.client.RateLimitExceeded) as e: - rate_throttling(resp, DEFAULT_SLEEP_SECONDS) - self.assertEqual(str(e.exception), "API rate limit exceeded, please try after 601 seconds.") + rate_throttling(resp, DEFAULT_SLEEP_SECONDS, DEFAULT_MIN_RATE_LIMIT) + self.assertEqual(str(e.exception), "API rate limit exceeded, please try after 602 seconds.") def test_rate_limit_not_exceeded(self, mocked_sleep): @@ -60,14 +62,30 @@ def test_rate_limit_not_exceeded(self, mocked_sleep): mocked_sleep.side_effect = None resp = api_call() - resp.headers["X-RateLimit-Reset"] = int(round(time.time(), 0)) + 10 + resp.headers["X-RateLimit-Reset"] = int(ceil(time.time())) + 10 resp.headers["X-RateLimit-Remaining"] = 5 - rate_throttling(resp, DEFAULT_SLEEP_SECONDS) + rate_throttling(resp, DEFAULT_SLEEP_SECONDS, DEFAULT_MIN_RATE_LIMIT) # Verify that `time.sleep` is not called self.assertFalse(mocked_sleep.called) + def test_rate_limit_wait_with_min_rate_limit_defined(self, mocked_sleep): + """ + Test `rate_throttling` if sleep time does exceed limit with a different value for the `min_rate_limit` + """ + + mocked_sleep.side_effect = None + + resp = api_call() + resp.headers["X-RateLimit-Reset"] = int(ceil(time.time())) + 10 + resp.headers["X-RateLimit-Remaining"] = 5 + + rate_throttling(resp, DEFAULT_SLEEP_SECONDS, min_rate_limit=5) + + # Verify `time.sleep` is called + self.assertTrue(mocked_sleep.called) + def test_rate_limt_header_not_found(self, mocked_sleep): """ Test that the `rate_throttling` function raises an exception if `X-RateLimit-Reset` key is not found in the header. @@ -76,7 +94,7 @@ def test_rate_limt_header_not_found(self, mocked_sleep): resp.headers={} with self.assertRaises(GithubException) as e: - rate_throttling(resp, DEFAULT_SLEEP_SECONDS) + rate_throttling(resp, DEFAULT_SLEEP_SECONDS, DEFAULT_MIN_RATE_LIMIT) # Verifying the message formed for the invalid base URL self.assertEqual(str(e.exception), "The API call using the specified base url was unsuccessful. Please double-check the provided base URL.") From f01ceb6bdafa79ade529d50fcdc9a24128089e3d Mon Sep 17 00:00:00 2001 From: joaoamaral <7281460+joaopamaral@users.noreply.github.com> Date: Fri, 21 Oct 2022 16:59:41 -0300 Subject: [PATCH 04/52] Fix comment in test_rate_limit_wait_with_min_rate_limit_defined test --- tests/unittests/test_rate_limit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unittests/test_rate_limit.py b/tests/unittests/test_rate_limit.py index dd7bfd93..52ac0fc0 100644 --- a/tests/unittests/test_rate_limit.py +++ b/tests/unittests/test_rate_limit.py @@ -72,7 +72,7 @@ def test_rate_limit_not_exceeded(self, mocked_sleep): def test_rate_limit_wait_with_min_rate_limit_defined(self, mocked_sleep): """ - Test `rate_throttling` if sleep time does exceed limit with a different value for the `min_rate_limit` + Test `rate_throttling` if remain rate limit > 0 and equal to `min_rate_limit` """ mocked_sleep.side_effect = None From 4ecaaaefc22de4b3d067514521739f6eca8a8e5e Mon Sep 17 00:00:00 2001 From: joaoamaral <7281460+joaopamaral@users.noreply.github.com> Date: Fri, 21 Oct 2022 17:53:12 -0300 Subject: [PATCH 05/52] Adding `remain` to variable names --- tap_github/client.py | 10 +++++----- tests/unittests/test_rate_limit.py | 16 ++++++++-------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/tap_github/client.py b/tap_github/client.py index ed2691ac..e8a11d4c 100644 --- a/tap_github/client.py +++ b/tap_github/client.py @@ -8,7 +8,7 @@ LOGGER = singer.get_logger() DEFAULT_SLEEP_SECONDS = 600 -DEFAULT_MIN_RATE_LIMIT = 0 +DEFAULT_MIN_REMAIN_RATE_LIMIT = 0 DEFAULT_DOMAIN = "https://api.github.com" # Set default timeout of 300 seconds @@ -134,12 +134,12 @@ def calculate_seconds(epoch): current = time.time() return int(ceil(epoch - current)) -def rate_throttling(response, max_sleep_seconds, min_rate_limit): +def rate_throttling(response, max_sleep_seconds, min_remain_rate_limit): """ For rate limit errors, get the remaining time before retrying and calculate the time to sleep before making a new request. """ if 'X-RateLimit-Remaining' in response.headers: - if int(response.headers['X-RateLimit-Remaining']) <= min_rate_limit: + if int(response.headers['X-RateLimit-Remaining']) <= min_remain_rate_limit: seconds_to_sleep = calculate_seconds(int(response.headers['X-RateLimit-Reset'])) if seconds_to_sleep > max_sleep_seconds: @@ -162,7 +162,7 @@ def __init__(self, config): self.session = requests.Session() self.base_url = config['base_url'] if config.get('base_url') else DEFAULT_DOMAIN self.max_sleep_seconds = self.config.get('max_sleep_seconds', DEFAULT_SLEEP_SECONDS) - self.min_rate_limit = self.config.get('min_rate_limit', DEFAULT_MIN_RATE_LIMIT) + self.min_remain_rate_limit = self.config.get('min_remain_rate_limit', DEFAULT_MIN_REMAIN_RATE_LIMIT) self.set_auth_in_session() self.not_accessible_repos = set() @@ -201,7 +201,7 @@ def authed_get(self, source, url, headers={}, stream="", should_skip_404 = True) if resp.status_code != 200: raise_for_error(resp, source, stream, self, should_skip_404) timer.tags[metrics.Tag.http_status_code] = resp.status_code - rate_throttling(resp, self.max_sleep_seconds, self.min_rate_limit) + rate_throttling(resp, self.max_sleep_seconds, self.min_remain_rate_limit) if resp.status_code == 404: # Return an empty response body since we're not raising a NotFoundException resp._content = b'{}' # pylint: disable=protected-access diff --git a/tests/unittests/test_rate_limit.py b/tests/unittests/test_rate_limit.py index 52ac0fc0..4de4ef98 100644 --- a/tests/unittests/test_rate_limit.py +++ b/tests/unittests/test_rate_limit.py @@ -7,7 +7,7 @@ import requests DEFAULT_SLEEP_SECONDS = 600 -DEFAULT_MIN_RATE_LIMIT = 0 +DEFAULT_MIN_REMAIN_RATE_LIMIT = 0 def api_call(): return requests.get("https://api.github.com/rate_limit") @@ -30,7 +30,7 @@ def test_rate_limt_wait(self, mocked_sleep): resp.headers["X-RateLimit-Reset"] = int(ceil(time.time())) + 120 resp.headers["X-RateLimit-Remaining"] = 0 - rate_throttling(resp, DEFAULT_SLEEP_SECONDS, DEFAULT_MIN_RATE_LIMIT) + rate_throttling(resp, DEFAULT_SLEEP_SECONDS, DEFAULT_MIN_REMAIN_RATE_LIMIT) # Verify `time.sleep` is called with expected seconds in response mocked_sleep.assert_called_with(121) @@ -50,7 +50,7 @@ def test_rate_limit_exception(self, mocked_sleep): # Verify exception is raised with proper message with self.assertRaises(tap_github.client.RateLimitExceeded) as e: - rate_throttling(resp, DEFAULT_SLEEP_SECONDS, DEFAULT_MIN_RATE_LIMIT) + rate_throttling(resp, DEFAULT_SLEEP_SECONDS, DEFAULT_MIN_REMAIN_RATE_LIMIT) self.assertEqual(str(e.exception), "API rate limit exceeded, please try after 602 seconds.") @@ -65,14 +65,14 @@ def test_rate_limit_not_exceeded(self, mocked_sleep): resp.headers["X-RateLimit-Reset"] = int(ceil(time.time())) + 10 resp.headers["X-RateLimit-Remaining"] = 5 - rate_throttling(resp, DEFAULT_SLEEP_SECONDS, DEFAULT_MIN_RATE_LIMIT) + rate_throttling(resp, DEFAULT_SLEEP_SECONDS, DEFAULT_MIN_REMAIN_RATE_LIMIT) # Verify that `time.sleep` is not called self.assertFalse(mocked_sleep.called) - def test_rate_limit_wait_with_min_rate_limit_defined(self, mocked_sleep): + def test_rate_limit_wait_with_min_remain_rate_limit_defined(self, mocked_sleep): """ - Test `rate_throttling` if remain rate limit > 0 and equal to `min_rate_limit` + Test `rate_throttling` if remain rate limit > 0 and equal to `min_remain_rate_limit` """ mocked_sleep.side_effect = None @@ -81,7 +81,7 @@ def test_rate_limit_wait_with_min_rate_limit_defined(self, mocked_sleep): resp.headers["X-RateLimit-Reset"] = int(ceil(time.time())) + 10 resp.headers["X-RateLimit-Remaining"] = 5 - rate_throttling(resp, DEFAULT_SLEEP_SECONDS, min_rate_limit=5) + rate_throttling(resp, DEFAULT_SLEEP_SECONDS, min_remain_rate_limit=5) # Verify `time.sleep` is called self.assertTrue(mocked_sleep.called) @@ -94,7 +94,7 @@ def test_rate_limt_header_not_found(self, mocked_sleep): resp.headers={} with self.assertRaises(GithubException) as e: - rate_throttling(resp, DEFAULT_SLEEP_SECONDS, DEFAULT_MIN_RATE_LIMIT) + rate_throttling(resp, DEFAULT_SLEEP_SECONDS, DEFAULT_MIN_REMAIN_RATE_LIMIT) # Verifying the message formed for the invalid base URL self.assertEqual(str(e.exception), "The API call using the specified base url was unsuccessful. Please double-check the provided base URL.") From 6be3a2a51755709fe5d37dc5920412de13fdbbdf Mon Sep 17 00:00:00 2001 From: joaoamaral <7281460+joaopamaral@users.noreply.github.com> Date: Thu, 12 Jan 2023 13:51:16 -0300 Subject: [PATCH 06/52] Stopping requests on last page and increasing max per page --- tap_github/client.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/tap_github/client.py b/tap_github/client.py index e8a11d4c..2120230e 100644 --- a/tap_github/client.py +++ b/tap_github/client.py @@ -1,5 +1,6 @@ import time import requests +from requests.models import PreparedRequest import backoff from simplejson import JSONDecodeError import singer @@ -9,6 +10,7 @@ LOGGER = singer.get_logger() DEFAULT_SLEEP_SECONDS = 600 DEFAULT_MIN_REMAIN_RATE_LIMIT = 0 +DEFAULT_MAX_PER_PAGE = 100 DEFAULT_DOMAIN = "https://api.github.com" # Set default timeout of 300 seconds @@ -97,7 +99,7 @@ class TooManyRequests(GithubException): } } -def raise_for_error(resp, source, stream, client, should_skip_404): +def raise_for_error(resp, source, stream, client, should_skip_404, should_skip_422): """ Retrieve the error code and the error message from the response and return custom exceptions accordingly. """ @@ -118,6 +120,13 @@ def raise_for_error(resp, source, stream, client, should_skip_404): # Don't raise a NotFoundException return None + if error_code == 422 and should_skip_422: + message = ("HTTP-error-code: 404, Error: {}. Please refer \'{}\' for more details. " + "The next pages will be skipped due to Github API limit of 40k records.").format( + response_json.get('message'), response_json.get("documentation_url")) + LOGGER.warning(message) + return None + message = "HTTP-error-code: {}, Error: {}".format( error_code, ERROR_CODE_EXCEPTION_MAPPING.get(error_code, {}).get("message", "Unknown Error") if response_json == {} else response_json) @@ -165,6 +174,7 @@ def __init__(self, config): self.min_remain_rate_limit = self.config.get('min_remain_rate_limit', DEFAULT_MIN_REMAIN_RATE_LIMIT) self.set_auth_in_session() self.not_accessible_repos = set() + self.max_per_page = self.config.get('max_per_page', DEFAULT_MAX_PER_PAGE) def get_request_timeout(self): """ @@ -211,12 +221,16 @@ def authed_get_all_pages(self, source, url, headers={}, stream="", should_skip_4 """ Fetch all pages of records and return them. """ + prepared_request = PreparedRequest() + prepared_request.prepare_url(url, {'per_page': self.max_per_page}) + url = prepared_request.url + LOGGER.info(url) while True: r = self.authed_get(source, url, headers, stream, should_skip_404) yield r # Fetch the next page if next found in the response. - if 'next' in r.links: + if 'next' in r.links and r.links['last'] != url: url = r.links['next']['url'] else: # Break the loop if all pages are fetched. From ac14f7894fe597d1ef5343af734eaa93a1f7f37b Mon Sep 17 00:00:00 2001 From: joaoamaral <7281460+joaopamaral@users.noreply.github.com> Date: Thu, 12 Jan 2023 14:40:38 -0300 Subject: [PATCH 07/52] Extracting prepare_url to method --- README.md | 6 +++++- tap_github/client.py | 23 +++++++++++------------ tests/unittests/test_custom_domain.py | 15 +++++++++++++++ 3 files changed, 31 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index e8c4df01..aa55f97c 100644 --- a/README.md +++ b/README.md @@ -63,9 +63,13 @@ This tap: "repository": "singer-io/tap-github singer-io/getting-started", "start_date": "2021-01-01T00:00:00Z", "request_timeout": 300, - "base_url": "https://api.github.com" + "base_url": "https://api.github.com", } ``` + +> Note: The max results per page is configurable with the parameter `max_per_page`, +> as default it will return 100 (that is the max of most of the endpoints) + 4. Run the tap in discovery mode to get properties.json file ```bash diff --git a/tap_github/client.py b/tap_github/client.py index 2120230e..12b203e5 100644 --- a/tap_github/client.py +++ b/tap_github/client.py @@ -99,7 +99,7 @@ class TooManyRequests(GithubException): } } -def raise_for_error(resp, source, stream, client, should_skip_404, should_skip_422): +def raise_for_error(resp, source, stream, client, should_skip_404): """ Retrieve the error code and the error message from the response and return custom exceptions accordingly. """ @@ -120,13 +120,6 @@ def raise_for_error(resp, source, stream, client, should_skip_404, should_skip_4 # Don't raise a NotFoundException return None - if error_code == 422 and should_skip_422: - message = ("HTTP-error-code: 404, Error: {}. Please refer \'{}\' for more details. " - "The next pages will be skipped due to Github API limit of 40k records.").format( - response_json.get('message'), response_json.get("documentation_url")) - LOGGER.warning(message) - return None - message = "HTTP-error-code: {}, Error: {}".format( error_code, ERROR_CODE_EXCEPTION_MAPPING.get(error_code, {}).get("message", "Unknown Error") if response_json == {} else response_json) @@ -221,10 +214,7 @@ def authed_get_all_pages(self, source, url, headers={}, stream="", should_skip_4 """ Fetch all pages of records and return them. """ - prepared_request = PreparedRequest() - prepared_request.prepare_url(url, {'per_page': self.max_per_page}) - url = prepared_request.url - LOGGER.info(url) + url = self.prepare_url(url) while True: r = self.authed_get(source, url, headers, stream, should_skip_404) yield r @@ -236,6 +226,15 @@ def authed_get_all_pages(self, source, url, headers={}, stream="", should_skip_4 # Break the loop if all pages are fetched. break + def prepare_url(self, url): + """ + Prepare the URL with some additional parameters + """ + prepared_request = PreparedRequest() + # Including max per page param + prepared_request.prepare_url(url, {'per_page': self.max_per_page}) + return prepared_request.url + def verify_repo_access(self, url_for_repo, repo): """ Call rest API to verify that the user has sufficient permissions to access this repository. diff --git a/tests/unittests/test_custom_domain.py b/tests/unittests/test_custom_domain.py index 139b2426..d996da18 100644 --- a/tests/unittests/test_custom_domain.py +++ b/tests/unittests/test_custom_domain.py @@ -27,3 +27,18 @@ def test_config_with_domain(self, mock_verify_access): # Verify domain in client is from config self.assertEqual(test_client.base_url, mock_config["base_url"]) + + def test_prepare_url(self, mock_verify_access): + """ + Test if the correct params are added to url + """ + mock_config = {'repository': 'singer-io/test-repo', "base_url": "http://CUSTOM-git.com", "access_token": "", "max_per_page": 35} + test_client = GithubClient(mock_config) + + # Verify domain in client is from config + self.assertEqual(test_client.prepare_url(test_client.base_url), f"{mock_config['base_url'].lower()}/?per_page=35") + self.assertEqual(test_client.prepare_url('http://CUSTOM-git.com/?q=query'), 'http://custom-git.com/?q=query&per_page=35') + + del mock_config["max_per_page"] + test_client2 = GithubClient(mock_config) + self.assertEqual(test_client2.prepare_url(test_client2.base_url), f"{mock_config['base_url'].lower()}/?per_page=100") \ No newline at end of file From 7c9fdae152b56cb2e71004a79dba6c18691e486e Mon Sep 17 00:00:00 2001 From: joaoamaral <7281460+joaopamaral@users.noreply.github.com> Date: Thu, 12 Jan 2023 14:52:27 -0300 Subject: [PATCH 08/52] nit --- README.md | 2 +- tests/unittests/test_custom_domain.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index aa55f97c..7b8e8845 100644 --- a/README.md +++ b/README.md @@ -63,7 +63,7 @@ This tap: "repository": "singer-io/tap-github singer-io/getting-started", "start_date": "2021-01-01T00:00:00Z", "request_timeout": 300, - "base_url": "https://api.github.com", + "base_url": "https://api.github.com" } ``` diff --git a/tests/unittests/test_custom_domain.py b/tests/unittests/test_custom_domain.py index d996da18..c719d45c 100644 --- a/tests/unittests/test_custom_domain.py +++ b/tests/unittests/test_custom_domain.py @@ -41,4 +41,4 @@ def test_prepare_url(self, mock_verify_access): del mock_config["max_per_page"] test_client2 = GithubClient(mock_config) - self.assertEqual(test_client2.prepare_url(test_client2.base_url), f"{mock_config['base_url'].lower()}/?per_page=100") \ No newline at end of file + self.assertEqual(test_client2.prepare_url(test_client2.base_url), f"{mock_config['base_url'].lower()}/?per_page=100") From 2d361d341d5f511094dff71ed8171eb752761f3e Mon Sep 17 00:00:00 2001 From: joaoamaral <7281460+joaopamaral@users.noreply.github.com> Date: Thu, 12 Jan 2023 14:54:01 -0300 Subject: [PATCH 09/52] nit --- tests/unittests/test_custom_domain.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/unittests/test_custom_domain.py b/tests/unittests/test_custom_domain.py index c719d45c..218c618f 100644 --- a/tests/unittests/test_custom_domain.py +++ b/tests/unittests/test_custom_domain.py @@ -35,10 +35,11 @@ def test_prepare_url(self, mock_verify_access): mock_config = {'repository': 'singer-io/test-repo', "base_url": "http://CUSTOM-git.com", "access_token": "", "max_per_page": 35} test_client = GithubClient(mock_config) - # Verify domain in client is from config + # Verify if per_page param was added as expected self.assertEqual(test_client.prepare_url(test_client.base_url), f"{mock_config['base_url'].lower()}/?per_page=35") self.assertEqual(test_client.prepare_url('http://CUSTOM-git.com/?q=query'), 'http://custom-git.com/?q=query&per_page=35') + # Verify if per_page param was added with default value del mock_config["max_per_page"] test_client2 = GithubClient(mock_config) self.assertEqual(test_client2.prepare_url(test_client2.base_url), f"{mock_config['base_url'].lower()}/?per_page=100") From 8c2d2812dcdc8ae60c4680ac64ae6489a647f41d Mon Sep 17 00:00:00 2001 From: Joao Amaral <7281460+joaopamaral@users.noreply.github.com> Date: Thu, 19 Jan 2023 08:32:08 -0300 Subject: [PATCH 10/52] Returning when seconds to sleep is 0 --- tap_github/client.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tap_github/client.py b/tap_github/client.py index 12b203e5..7372aae0 100644 --- a/tap_github/client.py +++ b/tap_github/client.py @@ -143,6 +143,8 @@ def rate_throttling(response, max_sleep_seconds, min_remain_rate_limit): if 'X-RateLimit-Remaining' in response.headers: if int(response.headers['X-RateLimit-Remaining']) <= min_remain_rate_limit: seconds_to_sleep = calculate_seconds(int(response.headers['X-RateLimit-Reset'])) + if seconds_to_sleep <= 0: + return if seconds_to_sleep > max_sleep_seconds: message = "API rate limit exceeded, please try after {} seconds.".format(seconds_to_sleep) From a67f7887abf27c206e609c302ee3dc9dba3136c4 Mon Sep 17 00:00:00 2001 From: joaoamaral <7281460+joaopamaral@users.noreply.github.com> Date: Thu, 19 Jan 2023 11:17:31 -0300 Subject: [PATCH 11/52] Refactoring tests --- tests/unittests/test_custom_domain.py | 28 +++++++++++++-------------- tests/unittests/test_rate_limit.py | 2 -- 2 files changed, 14 insertions(+), 16 deletions(-) diff --git a/tests/unittests/test_custom_domain.py b/tests/unittests/test_custom_domain.py index 218c618f..3a517786 100644 --- a/tests/unittests/test_custom_domain.py +++ b/tests/unittests/test_custom_domain.py @@ -12,8 +12,8 @@ def test_config_without_domain(self, mock_verify_access): """ Test if the domain is not given in the config """ - mock_config = {'repository': 'singer-io/test-repo', "access_token": ""} - test_client = GithubClient(mock_config) + config = {'repository': 'singer-io/test-repo', "access_token": ""} + test_client = GithubClient(config) # Verify domain in client is default self.assertEqual(test_client.base_url, DEFAULT_DOMAIN) @@ -22,24 +22,24 @@ def test_config_with_domain(self, mock_verify_access): """ Test if the domain is given in the config """ - mock_config = {'repository': 'singer-io/test-repo', "base_url": "http://CUSTOM-git.com", "access_token": ""} - test_client = GithubClient(mock_config) + config = {'repository': 'singer-io/test-repo', "base_url": "http://CUSTOM-git.com", "access_token": ""} + test_client = GithubClient(config) # Verify domain in client is from config - self.assertEqual(test_client.base_url, mock_config["base_url"]) + self.assertEqual(test_client.base_url, config["base_url"]) def test_prepare_url(self, mock_verify_access): """ Test if the correct params are added to url """ - mock_config = {'repository': 'singer-io/test-repo', "base_url": "http://CUSTOM-git.com", "access_token": "", "max_per_page": 35} - test_client = GithubClient(mock_config) - - # Verify if per_page param was added as expected - self.assertEqual(test_client.prepare_url(test_client.base_url), f"{mock_config['base_url'].lower()}/?per_page=35") - self.assertEqual(test_client.prepare_url('http://CUSTOM-git.com/?q=query'), 'http://custom-git.com/?q=query&per_page=35') + config = {'repository': 'singer-io/test-repo', "base_url": "http://CUSTOM-git.com", "access_token": ""} + test_client = GithubClient(config) # Verify if per_page param was added with default value - del mock_config["max_per_page"] - test_client2 = GithubClient(mock_config) - self.assertEqual(test_client2.prepare_url(test_client2.base_url), f"{mock_config['base_url'].lower()}/?per_page=100") + self.assertEqual(test_client.prepare_url(test_client.base_url), "http://custom-git.com/?per_page=100") + self.assertEqual(test_client.prepare_url('http://CUSTOM-git.com/?q=query'), 'http://custom-git.com/?q=query&per_page=100') + + # Verify if per_page param was added as expected + config["max_per_page"] = 35 + test_client2 = GithubClient(config) + self.assertEqual(test_client2.prepare_url(test_client2.base_url), "http://custom-git.com/?per_page=35") diff --git a/tests/unittests/test_rate_limit.py b/tests/unittests/test_rate_limit.py index 4de4ef98..770acd29 100644 --- a/tests/unittests/test_rate_limit.py +++ b/tests/unittests/test_rate_limit.py @@ -36,7 +36,6 @@ def test_rate_limt_wait(self, mocked_sleep): mocked_sleep.assert_called_with(121) self.assertTrue(mocked_sleep.called) - def test_rate_limit_exception(self, mocked_sleep): """ Test `rate_throttling` for 'sleep_time' greater than `MAX_SLEEP_SECONDS` @@ -53,7 +52,6 @@ def test_rate_limit_exception(self, mocked_sleep): rate_throttling(resp, DEFAULT_SLEEP_SECONDS, DEFAULT_MIN_REMAIN_RATE_LIMIT) self.assertEqual(str(e.exception), "API rate limit exceeded, please try after 602 seconds.") - def test_rate_limit_not_exceeded(self, mocked_sleep): """ Test `rate_throttling` if sleep time does not exceed limit From 077f8f4247d0e61eff4c9f1d06aeeb39a52994ff Mon Sep 17 00:00:00 2001 From: joaoamaral <7281460+joaopamaral@users.noreply.github.com> Date: Thu, 19 Jan 2023 11:51:27 -0300 Subject: [PATCH 12/52] Fixing TestAuthedGetAllPages tests ti include per_page argument --- tests/unittests/test_get_all_repos.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/tests/unittests/test_get_all_repos.py b/tests/unittests/test_get_all_repos.py index 9235acad..5db16fef 100644 --- a/tests/unittests/test_get_all_repos.py +++ b/tests/unittests/test_get_all_repos.py @@ -95,7 +95,7 @@ class TestAuthedGetAllPages(unittest.TestCase): """ Test `authed_get_all_pages` method from client. """ - config = {"access_token": "", "repository": "test-org/repo1"} + config = {"access_token": "", "repository": "test-org/repo1", "max_per_page": 100} def test_for_one_page(self, mock_auth_get, mock_verify_access): @@ -104,7 +104,7 @@ def test_for_one_page(self, mock_auth_get, mock_verify_access): test_client = GithubClient(self.config) mock_auth_get.return_value = MockResponse({}) - list(test_client.authed_get_all_pages("", "mock_url", {})) + list(test_client.authed_get_all_pages("", "http://mock_url", {})) # Verify `auth_get` call count self.assertEqual(mock_auth_get.call_count, 1) @@ -114,14 +114,15 @@ def test_for_multiple_pages(self, mock_auth_get, mock_verify_access): """Verify `authed_get` is called equal number times as pages available.""" test_client = GithubClient(self.config) - mock_auth_get.side_effect = [MockResponse({"next": {"url": "mock_url_2"}}),MockResponse({"next": {"url": "mock_url_3"}}),MockResponse({})] + mock_auth_get.side_effect = [MockResponse({"next": {"url": "http://mock_url_2/?per_page=100"}}), + MockResponse({"next": {"url": "http://mock_url_3/?per_page=100"}}),MockResponse({})] - list(test_client.authed_get_all_pages("", "mock_url_1", {})) + list(test_client.authed_get_all_pages("", "http://mock_url_1", {})) # Verify `auth_get` call count self.assertEqual(mock_auth_get.call_count, 3) # Verify `auth_get` calls with expected url - self.assertEqual(mock_auth_get.mock_calls[0], mock.call("", "mock_url_1", {}, '', True)) - self.assertEqual(mock_auth_get.mock_calls[1], mock.call("", "mock_url_2", {}, '', True)) - self.assertEqual(mock_auth_get.mock_calls[2], mock.call("", "mock_url_3", {}, '', True)) + self.assertEqual(mock_auth_get.mock_calls[0], mock.call("", "http://mock_url_1/?per_page=100", {}, '', True)) + self.assertEqual(mock_auth_get.mock_calls[1], mock.call("", "http://mock_url_2/?per_page=100", {}, '', True)) + self.assertEqual(mock_auth_get.mock_calls[2], mock.call("", "http://mock_url_3/?per_page=100", {}, '', True)) From 0ad5a283019b31de6e87412c193efafe48be23bd Mon Sep 17 00:00:00 2001 From: joaoamaral <7281460+joaopamaral@users.noreply.github.com> Date: Thu, 19 Jan 2023 12:12:47 -0300 Subject: [PATCH 13/52] Fixing pagination exceed limit issue --- tap_github/client.py | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/tap_github/client.py b/tap_github/client.py index 7372aae0..160ca1ec 100644 --- a/tap_github/client.py +++ b/tap_github/client.py @@ -16,6 +16,8 @@ # Set default timeout of 300 seconds REQUEST_TIMEOUT = 300 +PAGINATION_EXCEED_MSG = 'In order to keep the API fast for everyone, pagination is limited for this resource.' + class GithubException(Exception): pass @@ -120,6 +122,14 @@ def raise_for_error(resp, source, stream, client, should_skip_404): # Don't raise a NotFoundException return None + if error_code == 422 and PAGINATION_EXCEED_MSG in response_json.get('message', ''): + message = f"HTTP-error-code: 422, Error: {response_json.get('message', '')}. " \ + f"Please refer '{response_json.get('documentation_url')}' for more details." \ + "This is a known issue when the results exceed 40k and the last page is not full" \ + " (it will trim the results to get only the available by the API)." + LOGGER.warning(message) + return None + message = "HTTP-error-code: {}, Error: {}".format( error_code, ERROR_CODE_EXCEPTION_MAPPING.get(error_code, {}).get("message", "Unknown Error") if response_json == {} else response_json) @@ -134,7 +144,7 @@ def calculate_seconds(epoch): Calculate the seconds to sleep before making a new request. """ current = time.time() - return int(ceil(epoch - current)) + return max(0, int(ceil(epoch - current))) def rate_throttling(response, max_sleep_seconds, min_remain_rate_limit): """ @@ -143,8 +153,6 @@ def rate_throttling(response, max_sleep_seconds, min_remain_rate_limit): if 'X-RateLimit-Remaining' in response.headers: if int(response.headers['X-RateLimit-Remaining']) <= min_remain_rate_limit: seconds_to_sleep = calculate_seconds(int(response.headers['X-RateLimit-Reset'])) - if seconds_to_sleep <= 0: - return if seconds_to_sleep > max_sleep_seconds: message = "API rate limit exceeded, please try after {} seconds.".format(seconds_to_sleep) @@ -207,9 +215,9 @@ def authed_get(self, source, url, headers={}, stream="", should_skip_404 = True) raise_for_error(resp, source, stream, self, should_skip_404) timer.tags[metrics.Tag.http_status_code] = resp.status_code rate_throttling(resp, self.max_sleep_seconds, self.min_remain_rate_limit) - if resp.status_code == 404: + if resp.status_code == 404 or resp.status_code == 422: # Return an empty response body since we're not raising a NotFoundException - resp._content = b'{}' # pylint: disable=protected-access + resp._content = b'{}' # pylint: disable=protected-access return resp def authed_get_all_pages(self, source, url, headers={}, stream="", should_skip_404 = True): @@ -217,16 +225,17 @@ def authed_get_all_pages(self, source, url, headers={}, stream="", should_skip_4 Fetch all pages of records and return them. """ url = self.prepare_url(url) - while True: - r = self.authed_get(source, url, headers, stream, should_skip_404) - yield r + next_page = True + while next_page: + response = self.authed_get(source, url, headers, stream, should_skip_404) + yield response # Fetch the next page if next found in the response. - if 'next' in r.links and r.links['last'] != url: - url = r.links['next']['url'] + if 'next' in response.links: + url = response.links['next']['url'] else: - # Break the loop if all pages are fetched. - break + # Break the loop if all pages are fetched. + next_page = False def prepare_url(self, url): """ From 786438925aec2f21df9ae2da8d8c9089311b4f5e Mon Sep 17 00:00:00 2001 From: joaoamaral <7281460+joaopamaral@users.noreply.github.com> Date: Thu, 19 Jan 2023 12:17:16 -0300 Subject: [PATCH 14/52] Simplifying the pagination loop --- tap_github/client.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/tap_github/client.py b/tap_github/client.py index 160ca1ec..cd6aca70 100644 --- a/tap_github/client.py +++ b/tap_github/client.py @@ -224,18 +224,12 @@ def authed_get_all_pages(self, source, url, headers={}, stream="", should_skip_4 """ Fetch all pages of records and return them. """ - url = self.prepare_url(url) - next_page = True - while next_page: + next_url = self.prepare_url(url) + while next_url: response = self.authed_get(source, url, headers, stream, should_skip_404) yield response - # Fetch the next page if next found in the response. - if 'next' in response.links: - url = response.links['next']['url'] - else: - # Break the loop if all pages are fetched. - next_page = False + next_url = response.links.get('next', None) def prepare_url(self, url): """ From d780571ebf792c88e91e8aa4f2ed5bbb1a1a1f4b Mon Sep 17 00:00:00 2001 From: joaoamaral <7281460+joaopamaral@users.noreply.github.com> Date: Thu, 19 Jan 2023 13:10:21 -0300 Subject: [PATCH 15/52] Logging url instead of source --- tap_github/client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tap_github/client.py b/tap_github/client.py index cd6aca70..b7058367 100644 --- a/tap_github/client.py +++ b/tap_github/client.py @@ -208,7 +208,7 @@ def authed_get(self, source, url, headers={}, stream="", should_skip_404 = True) """ Call rest API and return the response in case of status code 200. """ - with metrics.http_request_timer(source) as timer: + with metrics.http_request_timer(url) as timer: self.session.headers.update(headers) resp = self.session.request(method='get', url=url, timeout=self.get_request_timeout()) if resp.status_code != 200: From 3d46ebc8406687bf4a66f50005325cad541e4ddf Mon Sep 17 00:00:00 2001 From: joaoamaral <7281460+joaopamaral@users.noreply.github.com> Date: Thu, 19 Jan 2023 13:18:11 -0300 Subject: [PATCH 16/52] Fixing next url --- tap_github/client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tap_github/client.py b/tap_github/client.py index b7058367..d85d5cca 100644 --- a/tap_github/client.py +++ b/tap_github/client.py @@ -229,7 +229,7 @@ def authed_get_all_pages(self, source, url, headers={}, stream="", should_skip_4 response = self.authed_get(source, url, headers, stream, should_skip_404) yield response - next_url = response.links.get('next', None) + next_url = response.links.get('next', {}).get('url', None) def prepare_url(self, url): """ From 5b53c97058cee25aae5b433f2d1964463ecdefdc Mon Sep 17 00:00:00 2001 From: joaoamaral <7281460+joaopamaral@users.noreply.github.com> Date: Thu, 19 Jan 2023 13:22:11 -0300 Subject: [PATCH 17/52] Fixing next url --- tap_github/client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tap_github/client.py b/tap_github/client.py index d85d5cca..b38286a1 100644 --- a/tap_github/client.py +++ b/tap_github/client.py @@ -226,7 +226,7 @@ def authed_get_all_pages(self, source, url, headers={}, stream="", should_skip_4 """ next_url = self.prepare_url(url) while next_url: - response = self.authed_get(source, url, headers, stream, should_skip_404) + response = self.authed_get(source, next_url, headers, stream, should_skip_404) yield response next_url = response.links.get('next', {}).get('url', None) From 35431dee310ed01ed276803984936b5006c4cb27 Mon Sep 17 00:00:00 2001 From: Joao Amaral <7281460+joaopamaral@users.noreply.github.com> Date: Tue, 27 Jun 2023 11:52:03 -0300 Subject: [PATCH 18/52] Removing _sdc_repository field --- tap_github/schemas/releases.json | 3 --- 1 file changed, 3 deletions(-) diff --git a/tap_github/schemas/releases.json b/tap_github/schemas/releases.json index b903a026..e836aded 100644 --- a/tap_github/schemas/releases.json +++ b/tap_github/schemas/releases.json @@ -2,9 +2,6 @@ "type": ["null", "object"], "additionalProperties": false, "properties": { - "_sdc_repository": { - "type": ["string"] - }, "id": { "type": ["null", "string"] }, From 5a1981314ad9f76986c92b825624da09701e9083 Mon Sep 17 00:00:00 2001 From: Joao Amaral <7281460+joaopamaral@users.noreply.github.com> Date: Tue, 27 Jun 2023 11:58:10 -0300 Subject: [PATCH 19/52] fixing discussion_url schema --- tap_github/schemas/releases.json | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tap_github/schemas/releases.json b/tap_github/schemas/releases.json index e836aded..736122d8 100644 --- a/tap_github/schemas/releases.json +++ b/tap_github/schemas/releases.json @@ -183,8 +183,7 @@ "format": "date-time" }, "discussion_url": { - "type": ["null", "string"], - "format": "date-time" + "type": ["null", "string"] } } } \ No newline at end of file From 510a5a7b9e0c53572bc0b4c17b0e131ab07c5610 Mon Sep 17 00:00:00 2001 From: Joao Amaral <7281460+joaopamaral@users.noreply.github.com> Date: Tue, 27 Jun 2023 11:59:13 -0300 Subject: [PATCH 20/52] Bump version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index b6c06fef..151dbf9c 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from setuptools import setup, find_packages setup(name='tap-github', - version='2.0.0', + version='2.0.3', description='Singer.io tap for extracting data from the GitHub API', author='Stitch', url='http://singer.io', From 452f3644989b07875b5c5ef3570a17f1aab773ff Mon Sep 17 00:00:00 2001 From: Joao Amaral <7281460+joaopamaral@users.noreply.github.com> Date: Tue, 27 Jun 2023 12:04:25 -0300 Subject: [PATCH 21/52] Test reverting _sdc_repository --- tap_github/schemas/releases.json | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tap_github/schemas/releases.json b/tap_github/schemas/releases.json index 736122d8..fc97ced2 100644 --- a/tap_github/schemas/releases.json +++ b/tap_github/schemas/releases.json @@ -2,6 +2,9 @@ "type": ["null", "object"], "additionalProperties": false, "properties": { + "_sdc_repository": { + "type": ["string"] + }, "id": { "type": ["null", "string"] }, From f310388a08c8bb28eba196f7882b21e355b983f6 Mon Sep 17 00:00:00 2001 From: Joao Amaral <7281460+joaopamaral@users.noreply.github.com> Date: Tue, 26 Sep 2023 11:46:33 -0300 Subject: [PATCH 22/52] Add retry attempt to BadCredentialsException --- setup.py | 2 +- tap_github/client.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 151dbf9c..e19b70d8 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from setuptools import setup, find_packages setup(name='tap-github', - version='2.0.3', + version='2.0.4', description='Singer.io tap for extracting data from the GitHub API', author='Stitch', url='http://singer.io', diff --git a/tap_github/client.py b/tap_github/client.py index b38286a1..1b5e7db1 100644 --- a/tap_github/client.py +++ b/tap_github/client.py @@ -203,7 +203,8 @@ def set_auth_in_session(self): # pylint: disable=dangerous-default-value # During 'Timeout' error there is also possibility of 'ConnectionError', # hence added backoff for 'ConnectionError' too. - @backoff.on_exception(backoff.expo, (requests.Timeout, requests.ConnectionError, Server5xxError, TooManyRequests), max_tries=5, factor=2) + @backoff.on_exception(backoff.expo, (requests.Timeout, requests.ConnectionError, Server5xxError, TooManyRequests, + BadCredentialsException), max_tries=5, factor=2) def authed_get(self, source, url, headers={}, stream="", should_skip_404 = True): """ Call rest API and return the response in case of status code 200. From 9ef11c454bdfd2e10824a1aa237366d098ee69a5 Mon Sep 17 00:00:00 2001 From: Joao Amaral <7281460+joaopamaral@users.noreply.github.com> Date: Tue, 26 Sep 2023 14:19:31 -0300 Subject: [PATCH 23/52] Add TestBadCredentialsBackoff --- tests/unittests/test_bad_credentials.py | 34 +++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 tests/unittests/test_bad_credentials.py diff --git a/tests/unittests/test_bad_credentials.py b/tests/unittests/test_bad_credentials.py new file mode 100644 index 00000000..303cc004 --- /dev/null +++ b/tests/unittests/test_bad_credentials.py @@ -0,0 +1,34 @@ +import unittest +from unittest import mock +from tap_github.client import GithubClient, BadCredentialsException + +from tests.unittests.test_timeout import get_args, get_response + + +@mock.patch("tap_github.client.GithubClient.verify_access_for_repo", return_value = None) +@mock.patch("time.sleep") +@mock.patch("requests.Session.request") +@mock.patch("singer.utils.parse_args") +class TestBadCredentialsBackoff(unittest.TestCase): + """ + Test case to verify that we backoff for 5 times for Bad Credentials error + """ + + def test_backoff(self, mocked_parse_args, mocked_request, mocked_sleep, mock_verify_access): + """ + Test that tap retry timeout or connection error 5 times. + """ + # mock request and raise error + mocked_request.return_value = get_response(401, json={"message": "Bad credentials"}) + + mock_config = {"access_token": "access_token"} + # mock parse args + mocked_parse_args.return_value = get_args(mock_config) + test_client = GithubClient(mock_config) + + with self.assertRaises(BadCredentialsException): + test_client.authed_get("test_source", "") + + # verify that we backoff 5 times + self.assertEqual(5, mocked_request.call_count) + From f7d4aa0d408f9655e67eb90cbb7df0c10ec5c5fd Mon Sep 17 00:00:00 2001 From: Joao Amaral <7281460+joaopamaral@users.noreply.github.com> Date: Tue, 26 Sep 2023 14:21:06 -0300 Subject: [PATCH 24/52] Add TestBadCredentialsBackoff --- tests/unittests/test_bad_credentials.py | 34 ---------------------- tests/unittests/test_exception_handling.py | 2 +- 2 files changed, 1 insertion(+), 35 deletions(-) delete mode 100644 tests/unittests/test_bad_credentials.py diff --git a/tests/unittests/test_bad_credentials.py b/tests/unittests/test_bad_credentials.py deleted file mode 100644 index 303cc004..00000000 --- a/tests/unittests/test_bad_credentials.py +++ /dev/null @@ -1,34 +0,0 @@ -import unittest -from unittest import mock -from tap_github.client import GithubClient, BadCredentialsException - -from tests.unittests.test_timeout import get_args, get_response - - -@mock.patch("tap_github.client.GithubClient.verify_access_for_repo", return_value = None) -@mock.patch("time.sleep") -@mock.patch("requests.Session.request") -@mock.patch("singer.utils.parse_args") -class TestBadCredentialsBackoff(unittest.TestCase): - """ - Test case to verify that we backoff for 5 times for Bad Credentials error - """ - - def test_backoff(self, mocked_parse_args, mocked_request, mocked_sleep, mock_verify_access): - """ - Test that tap retry timeout or connection error 5 times. - """ - # mock request and raise error - mocked_request.return_value = get_response(401, json={"message": "Bad credentials"}) - - mock_config = {"access_token": "access_token"} - # mock parse args - mocked_parse_args.return_value = get_args(mock_config) - test_client = GithubClient(mock_config) - - with self.assertRaises(BadCredentialsException): - test_client.authed_get("test_source", "") - - # verify that we backoff 5 times - self.assertEqual(5, mocked_request.call_count) - diff --git a/tests/unittests/test_exception_handling.py b/tests/unittests/test_exception_handling.py index 8c381054..397bfecc 100644 --- a/tests/unittests/test_exception_handling.py +++ b/tests/unittests/test_exception_handling.py @@ -63,7 +63,7 @@ def test_json_decoder_error(self, mocked_parse_args, mocked_request, mock_verify @parameterized.expand([ [400, "The request is missing or has a bad parameter.", BadRequestException, '', {}, 1], - [401, "Invalid authorization credentials.", BadCredentialsException, '', {}, 1], + [401, "Invalid authorization credentials.", BadCredentialsException, '', {}, 5], [403, "User doesn't have permission to access the resource.", AuthException, '', {}, 1], [500, "An error has occurred at Github's end.", InternalServerError, '', {}, 5], [301, "The resource you are looking for is moved to another URL.", tap_github.client.MovedPermanentlyError, '', {}, 1], From 8623de7b427899e4e40132373c85053a919772df Mon Sep 17 00:00:00 2001 From: Joao Amaral <7281460+joaopamaral@users.noreply.github.com> Date: Tue, 26 Sep 2023 17:34:17 -0300 Subject: [PATCH 25/52] Setting BadCredentialsException max_tries to 3 --- tap_github/client.py | 5 +++-- tests/unittests/test_exception_handling.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/tap_github/client.py b/tap_github/client.py index 1b5e7db1..27e2f384 100644 --- a/tap_github/client.py +++ b/tap_github/client.py @@ -203,8 +203,9 @@ def set_auth_in_session(self): # pylint: disable=dangerous-default-value # During 'Timeout' error there is also possibility of 'ConnectionError', # hence added backoff for 'ConnectionError' too. - @backoff.on_exception(backoff.expo, (requests.Timeout, requests.ConnectionError, Server5xxError, TooManyRequests, - BadCredentialsException), max_tries=5, factor=2) + @backoff.on_exception(backoff.expo, (requests.Timeout, requests.ConnectionError, Server5xxError, TooManyRequests), + max_tries=5, factor=2) + @backoff.on_exception(backoff.expo, (BadCredentialsException, ), max_tries=3, factor=2) def authed_get(self, source, url, headers={}, stream="", should_skip_404 = True): """ Call rest API and return the response in case of status code 200. diff --git a/tests/unittests/test_exception_handling.py b/tests/unittests/test_exception_handling.py index 397bfecc..05e48851 100644 --- a/tests/unittests/test_exception_handling.py +++ b/tests/unittests/test_exception_handling.py @@ -63,7 +63,7 @@ def test_json_decoder_error(self, mocked_parse_args, mocked_request, mock_verify @parameterized.expand([ [400, "The request is missing or has a bad parameter.", BadRequestException, '', {}, 1], - [401, "Invalid authorization credentials.", BadCredentialsException, '', {}, 5], + [401, "Invalid authorization credentials.", BadCredentialsException, '', {}, 3], [403, "User doesn't have permission to access the resource.", AuthException, '', {}, 1], [500, "An error has occurred at Github's end.", InternalServerError, '', {}, 5], [301, "The resource you are looking for is moved to another URL.", tap_github.client.MovedPermanentlyError, '', {}, 1], From dd73f3a4ab534109abda0ffca338930c9058f598 Mon Sep 17 00:00:00 2001 From: Joao Amaral <7281460+joaopamaral@users.noreply.github.com> Date: Thu, 23 Nov 2023 11:27:06 -0300 Subject: [PATCH 26/52] Not skip 404 on verify repo access --- tap_github/client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tap_github/client.py b/tap_github/client.py index 27e2f384..92c02f35 100644 --- a/tap_github/client.py +++ b/tap_github/client.py @@ -247,7 +247,7 @@ def verify_repo_access(self, url_for_repo, repo): Call rest API to verify that the user has sufficient permissions to access this repository. """ try: - self.authed_get("verifying repository access", url_for_repo) + self.authed_get("verifying repository access", url_for_repo, should_skip_404=False) except NotFoundException: # Throwing user-friendly error message as it checks token access message = "HTTP-error-code: 404, Error: Please check the repository name \'{}\' or you do not have sufficient permissions to access this repository.".format(repo) From c2649059acf23a2649741d83c1afe62ba8da26fb Mon Sep 17 00:00:00 2001 From: Joao Amaral <7281460+joaopamaral@users.noreply.github.com> Date: Thu, 23 Nov 2023 12:56:00 -0300 Subject: [PATCH 27/52] Add test --- tests/unittests/test_verify_access.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/unittests/test_verify_access.py b/tests/unittests/test_verify_access.py index bdd93209..700ae4ed 100644 --- a/tests/unittests/test_verify_access.py +++ b/tests/unittests/test_verify_access.py @@ -60,3 +60,15 @@ def test_repo_bad_creds(self, mocked_parse_args, mocked_request, mock_verify_acc # Verify error with proper message self.assertEqual(str(e.exception), "HTTP-error-code: 401, Error: {}".format(json)) + + def test_repo_no_permission(self, mocked_parse_args, mocked_request, mock_verify_access): + """Verify if 404 error arises""" + test_client = GithubClient(self.config) + json = {"message": "Please check the repository name 'repo' or you do not have sufficient permissions to access this repository.", "documentation_url": "https://docs.github.com/"} + mocked_request.return_value = get_response(404, json, True) + + with self.assertRaises(tap_github.client.NotFoundException) as e: + test_client.verify_repo_access("", "repo") + + # Verify error with proper message + self.assertEqual(str(e.exception), "HTTP-error-code: 404, Error: {}".format(json['message'])) From cf95950c703376fe0a9dc5539ce3261f38afdaeb Mon Sep 17 00:00:00 2001 From: Joao Amaral <7281460+joaopamaral@users.noreply.github.com> Date: Thu, 23 Nov 2023 13:46:27 -0300 Subject: [PATCH 28/52] Bump version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index e19b70d8..79890701 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from setuptools import setup, find_packages setup(name='tap-github', - version='2.0.4', + version='2.0.5', description='Singer.io tap for extracting data from the GitHub API', author='Stitch', url='http://singer.io', From bf330e30579e397e8cde4fe894ff0453ca5c826c Mon Sep 17 00:00:00 2001 From: Joao Amaral <7281460+joaopamaral@users.noreply.github.com> Date: Fri, 23 Aug 2024 15:26:33 -0300 Subject: [PATCH 29/52] Add open issues stream --- tap_github/schemas/open_issues.json | 276 ++++++++++++++++++++++++++++ tap_github/streams.py | 13 ++ 2 files changed, 289 insertions(+) create mode 100644 tap_github/schemas/open_issues.json diff --git a/tap_github/schemas/open_issues.json b/tap_github/schemas/open_issues.json new file mode 100644 index 00000000..93365708 --- /dev/null +++ b/tap_github/schemas/open_issues.json @@ -0,0 +1,276 @@ +{ + "properties": { + "state": { + "type": ["null", "string"] + }, + "state_reason": { + "type": ["null", "string"] + }, + "url": { + "type": ["null", "string"] + }, + "labels": { + "type": ["null", "array"], + "items": { + "type": ["null", "object"], + "properties": { + "id": { + "type": ["null", "integer"] + }, + "node_id": { + "type": ["null", "string"] + }, + "url": { + "type": ["null", "string"] + }, + "name": { + "type": ["null", "string"] + }, + "description": { + "type": ["null", "string"] + }, + "color": { + "type": ["null", "string"] + }, + "default": { + "type": ["null", "boolean"] + } + } + } + }, + "repository_url": { + "type": ["null", "string"] + }, + "number": { + "type": ["null", "integer"] + }, + "closed_at": { + "type": ["null", "string"], + "format": "date-time" + }, + "labels_url": { + "type": ["null", "string"] + }, + "title": { + "type": ["null", "string"] + }, + "assignee": { + "$ref": "shared/user.json#/" + }, + "assignees": { + "type": ["null", "array"], + "items": { + "$ref": "shared/user.json#/" + } + }, + "milestone": { + "type": ["null", "object"], + "properties": { + "url": { + "type": ["null", "string"] + }, + "html_url": { + "type": ["null", "string"] + }, + "labels_url": { + "type": ["null", "string"] + }, + "id": { + "type": ["null", "integer"] + }, + "node_id": { + "type": ["null", "string"] + }, + "number": { + "type": ["null", "integer"] + }, + "state": { + "type": ["null", "string"] + }, + "title": { + "type": ["null", "string"] + }, + "description": { + "type": ["null", "string"] + }, + "creator": { + "$ref": "shared/user.json#/" + }, + "open_issues": { + "type": ["null", "integer"] + }, + "closed_issues": { + "type": ["null", "integer"] + }, + "created_at": { + "type": ["null", "string"], + "format": "date-time" + }, + "updated_at": { + "type": ["null", "string"], + "format": "date-time" + }, + "closed_at": { + "type": ["null", "string"], + "format": "date-time" + }, + "due_on": { + "type": ["null", "string"], + "format": "date-time" + } + } + }, + "reactions": { + "$ref": "shared/reactions.json#/" + }, + "active_lock_reason": { + "type": ["null", "string"] + }, + "body_html": { + "type": ["null", "string"] + }, + "performed_via_github_app": { + "$ref": "shared/performed_via_github_app.json#/" + }, + "timeline_url": { + "type": ["null", "string"] + }, + "closed_by": { + "type": ["null", "object"], + "properties": { + "name": { + "type": ["null", "string"] + }, + "email": { + "type": ["null", "string"] + }, + "login": { + "type": ["null", "string"] + }, + "id": { + "type": ["null", "integer"] + }, + "node_id": { + "type": ["null", "string"] + }, + "avatar_url": { + "type": ["null", "string"] + }, + "gravatar_id": { + "type": ["null", "string"] + }, + "url": { + "type": ["null", "string"] + }, + "html_url": { + "type": ["null", "string"] + }, + "followers_url": { + "type": ["null", "string"] + }, + "following_url": { + "type": ["null", "string"] + }, + "gists_url": { + "type": ["null", "string"] + }, + "starred_url": { + "type": ["null", "string"] + }, + "subscriptions_url": { + "type": ["null", "string"] + }, + "organizations_url": { + "type": ["null", "string"] + }, + "repos_url": { + "type": ["null", "string"] + }, + "events_url": { + "type": ["null", "string"] + }, + "received_events_url": { + "type": ["null", "string"] + }, + "type": { + "type": ["null", "string"] + }, + "site_admin": { + "type": ["null", "boolean"] + }, + "starred_at": { + "type": ["null", "string"] + } + } + }, + "updated_at": { + "type": ["null", "string"], + "format": "date-time" + }, + "html_url": { + "type": ["null", "string"] + }, + "author_association": { + "type": ["null", "string"] + }, + "locked": { + "type": ["null", "boolean"] + }, + "events_url": { + "type": ["null", "string"] + }, + "pull_request": { + "properties": { + "diff_url": { + "type": ["null", "string"] + }, + "html_url": { + "type": ["null", "string"] + }, + "patch_url": { + "type": ["null", "string"] + }, + "url": { + "type": ["null", "string"] + }, + "merged_at": { + "type": ["null", "string"], + "format": "date-time" + } + }, + "type": ["null", "object"] + }, + "node_id": { + "type": ["null", "string"] + }, + "body": { + "type": ["null", "string"] + }, + "comments": { + "type": ["null", "integer"] + }, + "created_at": { + "type": ["null", "string"], + "format": "date-time" + }, + "_sdc_repository": { + "type": ["string"] + }, + "user": { + "$ref": "shared/user.json#/" + }, + "id": { + "type": ["null", "integer"] + }, + "comments_url": { + "type": ["null", "string"] + }, + "body_text": { + "type": ["null", "string"] + }, + "draft": { + "type": ["null", "boolean"] + } + }, + "type": ["null", "object"] +} diff --git a/tap_github/streams.py b/tap_github/streams.py index 278dd05a..3a75a003 100644 --- a/tap_github/streams.py +++ b/tap_github/streams.py @@ -648,6 +648,18 @@ class Issues(IncrementalOrderedStream): filter_param = True path = "issues?state=all&sort=updated&direction=desc" + +class OpenIssues(FullTableStream): + ''' + https://docs.github.com/en/rest/issues/issues#list-repository-issues + ''' + tap_stream_id = "open_issues" + replication_method = "FULL_TABLE" + key_properties = ["id"] + filter_param = False + path = "issues?state=open" + + class Assignees(FullTableStream): ''' https://docs.github.com/en/rest/issues/assignees#list-assignees @@ -753,6 +765,7 @@ def add_fields_at_1st_level(self, record, parent_record = None): "events": Events, "commit_comments": CommitComments, "issue_milestones": IssueMilestones, + "open_issues": OpenIssues, "projects": Projects, "project_columns": ProjectColumns, "project_cards": ProjectCards, From c40e79379eb5de1869c32455583212850a029c9c Mon Sep 17 00:00:00 2001 From: Joao Amaral <7281460+joaopamaral@users.noreply.github.com> Date: Fri, 23 Aug 2024 16:26:06 -0300 Subject: [PATCH 30/52] bump version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 79890701..2271e6ea 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from setuptools import setup, find_packages setup(name='tap-github', - version='2.0.5', + version='2.0.6', description='Singer.io tap for extracting data from the GitHub API', author='Stitch', url='http://singer.io', From 8ebcb494368690682636d0a8f77150ecabba8a11 Mon Sep 17 00:00:00 2001 From: Joao Amaral <7281460+joaopamaral@users.noreply.github.com> Date: Sat, 28 Sep 2024 06:18:31 -0300 Subject: [PATCH 31/52] Remove exception if no `X-RateLimit-Remaining` is not found in the header in the header --- tap_github/client.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tap_github/client.py b/tap_github/client.py index 92c02f35..6686fc30 100644 --- a/tap_github/client.py +++ b/tap_github/client.py @@ -160,10 +160,6 @@ def rate_throttling(response, max_sleep_seconds, min_remain_rate_limit): LOGGER.info("API rate limit exceeded. Tap will retry the data collection after %s seconds.", seconds_to_sleep) time.sleep(seconds_to_sleep) - else: - # Raise an exception if `X-RateLimit-Remaining` is not found in the header. - # API does include this key header if provided base URL is not a valid github custom domain. - raise GithubException("The API call using the specified base url was unsuccessful. Please double-check the provided base URL.") class GithubClient: """ From eedad3d218dc6bb3a1df0d6c801dbaa4f470123b Mon Sep 17 00:00:00 2001 From: Joao Amaral <7281460+joaopamaral@users.noreply.github.com> Date: Sat, 28 Sep 2024 06:22:21 -0300 Subject: [PATCH 32/52] Remove exception if no `X-RateLimit-Remaining` is not found in the header in the header --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 2271e6ea..3be29e12 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from setuptools import setup, find_packages setup(name='tap-github', - version='2.0.6', + version='2.0.7', description='Singer.io tap for extracting data from the GitHub API', author='Stitch', url='http://singer.io', From e5933ecd14bd112b7decb42eb83a715fbb419849 Mon Sep 17 00:00:00 2001 From: Joao Amaral <7281460+joaopamaral@users.noreply.github.com> Date: Mon, 7 Oct 2024 09:19:17 -0300 Subject: [PATCH 33/52] Revert "Remove the exception if no X-RateLimit-Remaining is found in the header" --- setup.py | 2 +- tap_github/client.py | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 3be29e12..2271e6ea 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from setuptools import setup, find_packages setup(name='tap-github', - version='2.0.7', + version='2.0.6', description='Singer.io tap for extracting data from the GitHub API', author='Stitch', url='http://singer.io', diff --git a/tap_github/client.py b/tap_github/client.py index 6686fc30..92c02f35 100644 --- a/tap_github/client.py +++ b/tap_github/client.py @@ -160,6 +160,10 @@ def rate_throttling(response, max_sleep_seconds, min_remain_rate_limit): LOGGER.info("API rate limit exceeded. Tap will retry the data collection after %s seconds.", seconds_to_sleep) time.sleep(seconds_to_sleep) + else: + # Raise an exception if `X-RateLimit-Remaining` is not found in the header. + # API does include this key header if provided base URL is not a valid github custom domain. + raise GithubException("The API call using the specified base url was unsuccessful. Please double-check the provided base URL.") class GithubClient: """ From cadb36551b38fd56d26df70f498c366c8ded3896 Mon Sep 17 00:00:00 2001 From: Joao Amaral <7281460+joaopamaral@users.noreply.github.com> Date: Mon, 7 Oct 2024 09:20:48 -0300 Subject: [PATCH 34/52] Bump correct version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 2271e6ea..1236da87 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from setuptools import setup, find_packages setup(name='tap-github', - version='2.0.6', + version='2.0.8', description='Singer.io tap for extracting data from the GitHub API', author='Stitch', url='http://singer.io', From c91b035c13ac77b4ee112a7909350b9b9a472a16 Mon Sep 17 00:00:00 2001 From: Joao Amaral <7281460+joaopamaral@users.noreply.github.com> Date: Mon, 13 Jan 2025 18:37:43 -0300 Subject: [PATCH 35/52] Add Forked repos stream --- tap_github/client.py | 4 +- tap_github/schemas/repo_forked_compares.json | 74 ++++++ tap_github/schemas/repo_forked_parents.json | 260 +++++++++++++++++++ tap_github/schemas/repos.json | 248 ++++++++++++++++++ tap_github/schemas/shared/repos.json | 248 ++++++++++++++++++ tap_github/streams.py | 140 +++++++++- tap_github/sync.py | 2 +- 7 files changed, 969 insertions(+), 7 deletions(-) create mode 100644 tap_github/schemas/repo_forked_compares.json create mode 100644 tap_github/schemas/repo_forked_parents.json create mode 100644 tap_github/schemas/repos.json create mode 100644 tap_github/schemas/shared/repos.json diff --git a/tap_github/client.py b/tap_github/client.py index 92c02f35..7e524a16 100644 --- a/tap_github/client.py +++ b/tap_github/client.py @@ -222,7 +222,7 @@ def authed_get(self, source, url, headers={}, stream="", should_skip_404 = True) resp._content = b'{}' # pylint: disable=protected-access return resp - def authed_get_all_pages(self, source, url, headers={}, stream="", should_skip_404 = True): + def authed_get_all_pages(self, source, url, headers={}, stream="", should_skip_404 = True, skip_pagination=True): """ Fetch all pages of records and return them. """ @@ -231,6 +231,8 @@ def authed_get_all_pages(self, source, url, headers={}, stream="", should_skip_4 response = self.authed_get(source, next_url, headers, stream, should_skip_404) yield response + if skip_pagination: + break next_url = response.links.get('next', {}).get('url', None) def prepare_url(self, url): diff --git a/tap_github/schemas/repo_forked_compares.json b/tap_github/schemas/repo_forked_compares.json new file mode 100644 index 00000000..0b756fbf --- /dev/null +++ b/tap_github/schemas/repo_forked_compares.json @@ -0,0 +1,74 @@ +{ + "type": ["null", "object"], + "properties": { + "_sdc_repository": { + "type": ["null", "string"] + }, + "name": { + "type": ["null", "string"] + }, + "full_name": { + "type": ["null", "string"] + }, + "owner_login": { + "type": ["null", "string"] + }, + "default_branch": { + "type": ["null", "string"] + }, + "fork_name": { + "type": ["null", "string"] + }, + "fork_owner_login": { + "type": ["null", "string"] + }, + "fork_default_branch": { + "type": ["null", "string"] + }, + "url": { + "type": ["null", "string"] + }, + "html_url": { + "type": ["null", "string"] + }, + "permalink_url": { + "type": ["null", "string"] + }, + "diff_url": { + "type": ["null", "string"] + }, + "patch_url": { + "type": ["null", "string"] + }, + "base_commit": { + "type": ["null", "object"] + }, + "merge_base_commit": { + "type": ["null", "object"] + }, + "status": { + "type": ["null", "string"] + }, + "ahead_by": { + "type": ["null", "number"] + }, + "behind_by": { + "type": ["null", "number"] + }, + "total_commits": { + "type": ["null", "number"] + }, + "commits": { + "type": ["null", "array"], + "items": { + "type": ["null", "object"] + } + }, + "files": { + "type": ["null", "array"], + "items": { + "type": ["null", "object"] + } + } + } +} diff --git a/tap_github/schemas/repo_forked_parents.json b/tap_github/schemas/repo_forked_parents.json new file mode 100644 index 00000000..31ff6129 --- /dev/null +++ b/tap_github/schemas/repo_forked_parents.json @@ -0,0 +1,260 @@ +{ + "type": ["null", "object"], + "properties": { + "_sdc_repository": { + "type": ["null", "string"] + }, + "id": { + "type": ["null", "number"] + }, + "node_id": { + "type": ["null", "string"] + }, + "url": { + "type": ["null", "string"] + }, + "html_url": { + "type": ["null", "string"] + }, + "name": { + "type": ["null", "string"] + }, + "full_name": { + "type": ["null", "string"] + }, + "private": { + "type": ["null", "boolean"] + }, + "owner": { + "$ref": "shared/user.json#/" + }, + "description": { + "type": ["null", "string"] + }, + "fork": { + "type": ["null", "boolean"] + }, + "fork_name": { + "type": ["null", "string"] + }, + "fork_owner_login": { + "type": ["null", "string"] + }, + "fork_default_branch": { + "type": ["null", "string"] + }, + "keys_url": { + "type": ["null", "string"] + }, + "collaborators_url": { + "type": ["null", "string"] + }, + "teams_url": { + "type": ["null", "string"] + }, + "hooks_url": { + "type": ["null", "string"] + }, + "issue_events_url": { + "type": ["null", "string"] + }, + "events_url": { + "type": ["null", "string"] + }, + "assignees_url": { + "type": ["null", "string"] + }, + "branches_url": { + "type": ["null", "string"] + }, + "tags_url": { + "type": ["null", "string"] + }, + "blobs_url": { + "type": ["null", "string"] + }, + "git_tags_url": { + "type": ["null", "string"] + }, + "git_refs_url": { + "type": ["null", "string"] + }, + "trees_url": { + "type": ["null", "string"] + }, + "statuses_url": { + "type": ["null", "string"] + }, + "languages_url": { + "type": ["null", "string"] + }, + "stargazers_url": { + "type": ["null", "string"] + }, + "contributors_url": { + "type": ["null", "string"] + }, + "subscribers_url": { + "type": ["null", "string"] + }, + "subscription_url": { + "type": ["null", "string"] + }, + "commits_url": { + "type": ["null", "string"] + }, + "git_commits_url": { + "type": ["null", "string"] + }, + "comments_url": { + "type": ["null", "string"] + }, + "issue_comment_url": { + "type": ["null", "string"] + }, + "contents_url": { + "type": ["null", "string"] + }, + "compare_url": { + "type": ["null", "string"] + }, + "merges_url": { + "type": ["null", "string"] + }, + "archive_url": { + "type": ["null", "string"] + }, + "downloads_url": { + "type": ["null", "string"] + }, + "issues_url": { + "type": ["null", "string"] + }, + "pulls_url": { + "type": ["null", "string"] + }, + "milestones_url": { + "type": ["null", "string"] + }, + "notifications_url": { + "type": ["null", "string"] + }, + "labels_url": { + "type": ["null", "string"] + }, + "releases_url": { + "type": ["null", "string"] + }, + "deployments_url": { + "type": ["null", "string"] + }, + "created_at": { + "type": ["null", "string"] + }, + "updated_at": { + "type": ["null", "string"] + }, + "pushed_at": { + "type": ["null", "string"] + }, + "git_url": { + "type": ["null", "string"] + }, + "ssh_url": { + "type": ["null", "string"] + }, + "clone_url": { + "type": ["null", "string"] + }, + "svn_url": { + "type": ["null", "string"] + }, + "homepage": { + "type": ["null", "string"] + }, + "size": { + "type": ["null", "number"] + }, + "stargazers_count": { + "type": ["null", "number"] + }, + "watchers_count": { + "type": ["null", "number"] + }, + "language": { + "type": ["null", "string"] + }, + "has_issues": { + "type": ["null", "boolean"] + }, + "has_projects": { + "type": ["null", "boolean"] + }, + "has_downloads": { + "type": ["null", "boolean"] + }, + "has_wiki": { + "type": ["null", "boolean"] + }, + "has_pages": { + "type": ["null", "boolean"] + }, + "has_discussions": { + "type": ["null", "boolean"] + }, + "forks_count": { + "type": ["null", "number"] + }, + "mirror_url": { + "type": ["null", "string"] + }, + "archived": { + "type": ["null", "boolean"] + }, + "disabled": { + "type": ["null", "boolean"] + }, + "open_issues_count": { + "type": ["null", "number"] + }, + "license": { + "type": ["null", "object"] + }, + "allow_forking": { + "type": ["null", "boolean"] + }, + "is_template": { + "type": ["null", "boolean"] + }, + "web_commit_signoff_required": { + "type": ["null", "boolean"] + }, + "topics": { + "type": ["null", "array"], + "items": { + "type": ["string"] + } + }, + "visibility": { + "type": ["null", "string"] + }, + "forks": { + "type": ["null", "number"] + }, + "open_issues": { + "type": ["null", "number"] + }, + "watchers": { + "type": ["null", "number"] + }, + "default_branch": { + "type": ["null", "string"] + }, + "permissions": { + "$ref": "shared/pull_permissions.json#/" + }, + "parent": { + "$ref": "shared/repos.json#/" + } + } +} diff --git a/tap_github/schemas/repos.json b/tap_github/schemas/repos.json new file mode 100644 index 00000000..7954ed2e --- /dev/null +++ b/tap_github/schemas/repos.json @@ -0,0 +1,248 @@ +{ + "type": ["null", "object"], + "properties": { + "_sdc_repository": { + "type": ["null", "string"] + }, + "id": { + "type": ["null", "number"] + }, + "node_id": { + "type": ["null", "string"] + }, + "url": { + "type": ["null", "string"] + }, + "html_url": { + "type": ["null", "string"] + }, + "name": { + "type": ["null", "string"] + }, + "full_name": { + "type": ["null", "string"] + }, + "private": { + "type": ["null", "boolean"] + }, + "owner": { + "$ref": "shared/user.json#/" + }, + "description": { + "type": ["null", "string"] + }, + "fork": { + "type": ["null", "boolean"] + }, + "keys_url": { + "type": ["null", "string"] + }, + "collaborators_url": { + "type": ["null", "string"] + }, + "teams_url": { + "type": ["null", "string"] + }, + "hooks_url": { + "type": ["null", "string"] + }, + "issue_events_url": { + "type": ["null", "string"] + }, + "events_url": { + "type": ["null", "string"] + }, + "assignees_url": { + "type": ["null", "string"] + }, + "branches_url": { + "type": ["null", "string"] + }, + "tags_url": { + "type": ["null", "string"] + }, + "blobs_url": { + "type": ["null", "string"] + }, + "git_tags_url": { + "type": ["null", "string"] + }, + "git_refs_url": { + "type": ["null", "string"] + }, + "trees_url": { + "type": ["null", "string"] + }, + "statuses_url": { + "type": ["null", "string"] + }, + "languages_url": { + "type": ["null", "string"] + }, + "stargazers_url": { + "type": ["null", "string"] + }, + "contributors_url": { + "type": ["null", "string"] + }, + "subscribers_url": { + "type": ["null", "string"] + }, + "subscription_url": { + "type": ["null", "string"] + }, + "commits_url": { + "type": ["null", "string"] + }, + "git_commits_url": { + "type": ["null", "string"] + }, + "comments_url": { + "type": ["null", "string"] + }, + "issue_comment_url": { + "type": ["null", "string"] + }, + "contents_url": { + "type": ["null", "string"] + }, + "compare_url": { + "type": ["null", "string"] + }, + "merges_url": { + "type": ["null", "string"] + }, + "archive_url": { + "type": ["null", "string"] + }, + "downloads_url": { + "type": ["null", "string"] + }, + "issues_url": { + "type": ["null", "string"] + }, + "pulls_url": { + "type": ["null", "string"] + }, + "milestones_url": { + "type": ["null", "string"] + }, + "notifications_url": { + "type": ["null", "string"] + }, + "labels_url": { + "type": ["null", "string"] + }, + "releases_url": { + "type": ["null", "string"] + }, + "deployments_url": { + "type": ["null", "string"] + }, + "created_at": { + "type": ["null", "string"] + }, + "updated_at": { + "type": ["null", "string"] + }, + "pushed_at": { + "type": ["null", "string"] + }, + "git_url": { + "type": ["null", "string"] + }, + "ssh_url": { + "type": ["null", "string"] + }, + "clone_url": { + "type": ["null", "string"] + }, + "svn_url": { + "type": ["null", "string"] + }, + "homepage": { + "type": ["null", "string"] + }, + "size": { + "type": ["null", "number"] + }, + "stargazers_count": { + "type": ["null", "number"] + }, + "watchers_count": { + "type": ["null", "number"] + }, + "language": { + "type": ["null", "string"] + }, + "has_issues": { + "type": ["null", "boolean"] + }, + "has_projects": { + "type": ["null", "boolean"] + }, + "has_downloads": { + "type": ["null", "boolean"] + }, + "has_wiki": { + "type": ["null", "boolean"] + }, + "has_pages": { + "type": ["null", "boolean"] + }, + "has_discussions": { + "type": ["null", "boolean"] + }, + "forks_count": { + "type": ["null", "number"] + }, + "mirror_url": { + "type": ["null", "string"] + }, + "archived": { + "type": ["null", "boolean"] + }, + "disabled": { + "type": ["null", "boolean"] + }, + "open_issues_count": { + "type": ["null", "number"] + }, + "license": { + "type": ["null", "object"] + }, + "allow_forking": { + "type": ["null", "boolean"] + }, + "is_template": { + "type": ["null", "boolean"] + }, + "web_commit_signoff_required": { + "type": ["null", "boolean"] + }, + "topics": { + "type": ["null", "array"], + "items": { + "type": ["string"] + } + }, + "visibility": { + "type": ["null", "string"] + }, + "forks": { + "type": ["null", "number"] + }, + "open_issues": { + "type": ["null", "number"] + }, + "watchers": { + "type": ["null", "number"] + }, + "default_branch": { + "type": ["null", "string"] + }, + "permissions": { + "$ref": "shared/pull_permissions.json#/" + } + } +} diff --git a/tap_github/schemas/shared/repos.json b/tap_github/schemas/shared/repos.json new file mode 100644 index 00000000..7954ed2e --- /dev/null +++ b/tap_github/schemas/shared/repos.json @@ -0,0 +1,248 @@ +{ + "type": ["null", "object"], + "properties": { + "_sdc_repository": { + "type": ["null", "string"] + }, + "id": { + "type": ["null", "number"] + }, + "node_id": { + "type": ["null", "string"] + }, + "url": { + "type": ["null", "string"] + }, + "html_url": { + "type": ["null", "string"] + }, + "name": { + "type": ["null", "string"] + }, + "full_name": { + "type": ["null", "string"] + }, + "private": { + "type": ["null", "boolean"] + }, + "owner": { + "$ref": "shared/user.json#/" + }, + "description": { + "type": ["null", "string"] + }, + "fork": { + "type": ["null", "boolean"] + }, + "keys_url": { + "type": ["null", "string"] + }, + "collaborators_url": { + "type": ["null", "string"] + }, + "teams_url": { + "type": ["null", "string"] + }, + "hooks_url": { + "type": ["null", "string"] + }, + "issue_events_url": { + "type": ["null", "string"] + }, + "events_url": { + "type": ["null", "string"] + }, + "assignees_url": { + "type": ["null", "string"] + }, + "branches_url": { + "type": ["null", "string"] + }, + "tags_url": { + "type": ["null", "string"] + }, + "blobs_url": { + "type": ["null", "string"] + }, + "git_tags_url": { + "type": ["null", "string"] + }, + "git_refs_url": { + "type": ["null", "string"] + }, + "trees_url": { + "type": ["null", "string"] + }, + "statuses_url": { + "type": ["null", "string"] + }, + "languages_url": { + "type": ["null", "string"] + }, + "stargazers_url": { + "type": ["null", "string"] + }, + "contributors_url": { + "type": ["null", "string"] + }, + "subscribers_url": { + "type": ["null", "string"] + }, + "subscription_url": { + "type": ["null", "string"] + }, + "commits_url": { + "type": ["null", "string"] + }, + "git_commits_url": { + "type": ["null", "string"] + }, + "comments_url": { + "type": ["null", "string"] + }, + "issue_comment_url": { + "type": ["null", "string"] + }, + "contents_url": { + "type": ["null", "string"] + }, + "compare_url": { + "type": ["null", "string"] + }, + "merges_url": { + "type": ["null", "string"] + }, + "archive_url": { + "type": ["null", "string"] + }, + "downloads_url": { + "type": ["null", "string"] + }, + "issues_url": { + "type": ["null", "string"] + }, + "pulls_url": { + "type": ["null", "string"] + }, + "milestones_url": { + "type": ["null", "string"] + }, + "notifications_url": { + "type": ["null", "string"] + }, + "labels_url": { + "type": ["null", "string"] + }, + "releases_url": { + "type": ["null", "string"] + }, + "deployments_url": { + "type": ["null", "string"] + }, + "created_at": { + "type": ["null", "string"] + }, + "updated_at": { + "type": ["null", "string"] + }, + "pushed_at": { + "type": ["null", "string"] + }, + "git_url": { + "type": ["null", "string"] + }, + "ssh_url": { + "type": ["null", "string"] + }, + "clone_url": { + "type": ["null", "string"] + }, + "svn_url": { + "type": ["null", "string"] + }, + "homepage": { + "type": ["null", "string"] + }, + "size": { + "type": ["null", "number"] + }, + "stargazers_count": { + "type": ["null", "number"] + }, + "watchers_count": { + "type": ["null", "number"] + }, + "language": { + "type": ["null", "string"] + }, + "has_issues": { + "type": ["null", "boolean"] + }, + "has_projects": { + "type": ["null", "boolean"] + }, + "has_downloads": { + "type": ["null", "boolean"] + }, + "has_wiki": { + "type": ["null", "boolean"] + }, + "has_pages": { + "type": ["null", "boolean"] + }, + "has_discussions": { + "type": ["null", "boolean"] + }, + "forks_count": { + "type": ["null", "number"] + }, + "mirror_url": { + "type": ["null", "string"] + }, + "archived": { + "type": ["null", "boolean"] + }, + "disabled": { + "type": ["null", "boolean"] + }, + "open_issues_count": { + "type": ["null", "number"] + }, + "license": { + "type": ["null", "object"] + }, + "allow_forking": { + "type": ["null", "boolean"] + }, + "is_template": { + "type": ["null", "boolean"] + }, + "web_commit_signoff_required": { + "type": ["null", "boolean"] + }, + "topics": { + "type": ["null", "array"], + "items": { + "type": ["string"] + } + }, + "visibility": { + "type": ["null", "string"] + }, + "forks": { + "type": ["null", "number"] + }, + "open_issues": { + "type": ["null", "number"] + }, + "watchers": { + "type": ["null", "number"] + }, + "default_branch": { + "type": ["null", "string"] + }, + "permissions": { + "$ref": "shared/pull_permissions.json#/" + } + } +} diff --git a/tap_github/streams.py b/tap_github/streams.py index 3a75a003..828b556f 100644 --- a/tap_github/streams.py +++ b/tap_github/streams.py @@ -2,6 +2,8 @@ import singer from singer import (metrics, bookmarks, metadata) +from tap_github.client import UnprocessableError + LOGGER = singer.get_logger() DATE_FORMAT = '%Y-%m-%dT%H:%M:%SZ' @@ -68,6 +70,7 @@ class Stream: use_repository = False headers = {'Accept': '*/*'} parent = None + skip_pagination = True def build_url(self, base_url, repo_path, bookmark): """ @@ -156,7 +159,8 @@ def get_child_records(self, for response in client.authed_get_all_pages( child_object.tap_stream_id, child_full_url, - stream = child_object.tap_stream_id + stream = child_object.tap_stream_id, + skip_pagination = child_object.skip_pagination ): records = response.json() extraction_time = singer.utils.now() @@ -196,6 +200,17 @@ def get_child_records(self, singer.write_record(child_object.tap_stream_id, rec, time_extracted=extraction_time) + # Loop thru each child and nested child in the parent and fetch all the child records. + for nested_child in child_object.children: + if nested_child in stream_to_sync: + # Collect id of child record to pass in the API of its sub-child. + child_id = tuple(records.get(key) for key in STREAMS[nested_child]().id_keys) + # Here, grand_parent_id is the id of 1st level parent(main parent) which is required to + # pass in the API of the current child's sub-child. + child_object.get_child_records(client, catalog, nested_child, child_id, repo_path, state, start_date, + bookmark_dttm, stream_to_sync, selected_stream_ids, grand_parent_id, + records) + # pylint: disable=unnecessary-pass def add_fields_at_1st_level(self, record, parent_record = None): """ @@ -227,7 +242,8 @@ def sync_endpoint(self, self.tap_stream_id, full_url, self.headers, - stream = self.tap_stream_id + stream = self.tap_stream_id, + skip_pagination = self.skip_pagination ): records = response.json() extraction_time = singer.utils.now() @@ -298,7 +314,8 @@ def sync_endpoint(self, self.tap_stream_id, full_url, self.headers, - stream = self.tap_stream_id + stream = self.tap_stream_id, + skip_pagination = self.skip_pagination ): records = response.json() extraction_time = singer.utils.now() @@ -384,7 +401,8 @@ def sync_endpoint(self, for response in client.authed_get_all_pages( self.tap_stream_id, full_url, - stream = self.tap_stream_id + stream = self.tap_stream_id, + skip_pagination = self.skip_pagination ): records = response.json() extraction_time = singer.utils.now() @@ -753,6 +771,115 @@ def add_fields_at_1st_level(self, record, parent_record = None): record['user_id'] = record['user']['id'] +class Repos(FullTableStream): + ''' + https://docs.github.com/en/rest/repos/repos#list-organization-repositories + ''' + tap_stream_id = "repos" + replication_method = "FULL_TABLE" + key_properties = ["id"] + use_organization = True + path = "orgs/{}/repos?per_page=100" + has_children = True + children= ["repo_forked_parents"] + pk_child_fields = ['_sdc_repository'] + + def get_child_records(self, + client, + catalog, + child_stream, + grand_parent_id, + repo_path, + state, + start_date, + bookmark_dttm, + stream_to_sync, + selected_stream_ids, + parent_id=None, + parent_record=None): + if child_stream == 'repo_forked_parents' and parent_record and not parent_record.get('fork'): + # Skip fetching child records for non-forked repositories. + return + super().get_child_records(client, catalog, child_stream, grand_parent_id, repo_path, state, start_date, + bookmark_dttm, stream_to_sync, selected_stream_ids, parent_id, parent_record) + + +class RepoForkedParents(FullTableStream): + ''' + Get parent repositories of a forked repository. + https://docs.github.com/en/rest/repos/repos?apiVersion=2022-11-28#get-a-repository + ''' + tap_stream_id = "repo_forked_parents" + replication_method = "FULL_TABLE" + key_properties = ["_sdc_repository"] + use_organization = True + path = "repos/{}/{}" + parent = 'repos' + id_keys = ['name', 'default_branch'] + has_children = True + children = ["repo_forked_compares"] + pk_child_fields = ['_sdc_repository'] + + def add_fields_at_1st_level(self, record, parent_record = None): + """ + Add fields in the record explicitly at the 1st level of JSON. + """ + record['fork_name'] = record['parent']['owner']['login'] + record['fork_owner_login'] = record['parent']['owner']['login'] + record['fork_default_branch'] = record['parent']['default_branch'] + + def get_child_records(self, + client, + catalog, + child_stream, + grand_parent_id, + repo_path, + state, + start_date, + bookmark_dttm, + stream_to_sync, + selected_stream_ids, + parent_id=None, + parent_record=None): + try: + super().get_child_records(client, catalog, child_stream, grand_parent_id, repo_path, state, start_date, + bookmark_dttm, stream_to_sync, selected_stream_ids, parent_id, parent_record) + except UnprocessableError as e: + error_message = str(e) + if "HTTP-error-code: 422" in error_message and "Sorry, this diff is taking too long to generate" in error_message: + LOGGER.warning(f'Can\'t compare the forked repository ({selected_stream_ids}) with the parent repository. Error: {error_message}') + else: + raise e + + + +class RepoForkedCompares(FullTableStream): + ''' + Compare the repo with the default branch of the parent repository. + https://docs.github.com/en/rest/commits/commits?apiVersion=2022-11-28#compare-two-commits + ''' + tap_stream_id = "repo_forked_compares" + replication_method = "FULL_TABLE" + key_properties = ["_sdc_repository"] + use_organization = True + path = "repos/{}/{}/compare/{}...{}:{}" + parent = 'repo_forked_parents' + id_keys = ['fork_owner_login', 'fork_default_branch'] + skip_pagination = False # No pagination required for this stream. + + def add_fields_at_1st_level(self, record, parent_record = None): + """ + Add fields in the record explicitly at the 1st level of JSON. + """ + record['name'] = parent_record['name'] + record['owner_login'] = parent_record['owner']['login'] + record['full_name'] = parent_record['full_name'] + record['default_branch'] = parent_record['default_branch'] + record['fork_owner_login'] = parent_record['fork_owner_login'] + record['fork_name'] = parent_record['fork_name'] + record['fork_default_branch'] = parent_record['fork_default_branch'] + + # Dictionary of the stream classes STREAMS = { "commits": Commits, @@ -777,5 +904,8 @@ def add_fields_at_1st_level(self, record, parent_record = None): "team_members": TeamMembers, "team_memberships": TeamMemberships, "collaborators": Collaborators, - "stargazers": StarGazers + "stargazers": StarGazers, + "repos": Repos, + "repo_forked_parents": RepoForkedParents, + "repo_forked_compares": RepoForkedCompares } diff --git a/tap_github/sync.py b/tap_github/sync.py index a83610ad..d0e4936b 100644 --- a/tap_github/sync.py +++ b/tap_github/sync.py @@ -4,7 +4,7 @@ from tap_github.streams import STREAMS LOGGER = singer.get_logger() -STREAM_TO_SYNC_FOR_ORGS = ['teams', 'team_members', 'team_memberships'] +STREAM_TO_SYNC_FOR_ORGS = ['teams', 'team_members', 'team_memberships', 'repos', 'repo_forked_parents', 'repo_forked_compares'] def get_selected_streams(catalog): ''' From 55bbe381543152c83750bb3417324156a91ddbeb Mon Sep 17 00:00:00 2001 From: Joao Amaral <7281460+joaopamaral@users.noreply.github.com> Date: Tue, 14 Jan 2025 15:02:37 -0300 Subject: [PATCH 36/52] Fix --- tap_github/client.py | 2 +- tap_github/streams.py | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tap_github/client.py b/tap_github/client.py index 7e524a16..370fea9c 100644 --- a/tap_github/client.py +++ b/tap_github/client.py @@ -222,7 +222,7 @@ def authed_get(self, source, url, headers={}, stream="", should_skip_404 = True) resp._content = b'{}' # pylint: disable=protected-access return resp - def authed_get_all_pages(self, source, url, headers={}, stream="", should_skip_404 = True, skip_pagination=True): + def authed_get_all_pages(self, source, url, headers={}, stream="", should_skip_404 = True, skip_pagination=False): """ Fetch all pages of records and return them. """ diff --git a/tap_github/streams.py b/tap_github/streams.py index 828b556f..b1b00073 100644 --- a/tap_github/streams.py +++ b/tap_github/streams.py @@ -70,7 +70,7 @@ class Stream: use_repository = False headers = {'Accept': '*/*'} parent = None - skip_pagination = True + skip_pagination = False def build_url(self, base_url, repo_path, bookmark): """ @@ -782,7 +782,7 @@ class Repos(FullTableStream): path = "orgs/{}/repos?per_page=100" has_children = True children= ["repo_forked_parents"] - pk_child_fields = ['_sdc_repository'] + pk_child_fields = ['id'] def get_child_records(self, client, @@ -811,20 +811,20 @@ class RepoForkedParents(FullTableStream): ''' tap_stream_id = "repo_forked_parents" replication_method = "FULL_TABLE" - key_properties = ["_sdc_repository"] + key_properties = ["id"] use_organization = True path = "repos/{}/{}" parent = 'repos' id_keys = ['name', 'default_branch'] has_children = True children = ["repo_forked_compares"] - pk_child_fields = ['_sdc_repository'] + pk_child_fields = ['id'] def add_fields_at_1st_level(self, record, parent_record = None): """ Add fields in the record explicitly at the 1st level of JSON. """ - record['fork_name'] = record['parent']['owner']['login'] + record['fork_name'] = record['parent']['name'] record['fork_owner_login'] = record['parent']['owner']['login'] record['fork_default_branch'] = record['parent']['default_branch'] @@ -860,12 +860,12 @@ class RepoForkedCompares(FullTableStream): ''' tap_stream_id = "repo_forked_compares" replication_method = "FULL_TABLE" - key_properties = ["_sdc_repository"] + key_properties = ["full_name"] use_organization = True path = "repos/{}/{}/compare/{}...{}:{}" parent = 'repo_forked_parents' id_keys = ['fork_owner_login', 'fork_default_branch'] - skip_pagination = False # No pagination required for this stream. + skip_pagination = True # No pagination required for this stream. def add_fields_at_1st_level(self, record, parent_record = None): """ From 5464ab15f6241c2d90b98265ed780414e726e9b0 Mon Sep 17 00:00:00 2001 From: Joao Amaral <7281460+joaopamaral@users.noreply.github.com> Date: Tue, 14 Jan 2025 15:27:12 -0300 Subject: [PATCH 37/52] Fix tests --- tests/unittests/test_sync_endpoint.py | 34 +++++++++++++-------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/tests/unittests/test_sync_endpoint.py b/tests/unittests/test_sync_endpoint.py index 338d9ea4..86bc4348 100644 --- a/tests/unittests/test_sync_endpoint.py +++ b/tests/unittests/test_sync_endpoint.py @@ -37,7 +37,7 @@ def test_sync_without_state(self, mock_write_records, mock_authed_all_pages, moc self.assertEqual(final_state, expected_state) # Verify `get_auth_all_pages` called with expected url - mock_authed_all_pages.assert_called_with(mock.ANY, 'https://api.github.com/repos/tap-github/events', mock.ANY, stream='events') + mock_authed_all_pages.assert_called_with(mock.ANY, 'https://api.github.com/repos/tap-github/events', mock.ANY, stream='events', skip_pagination=False) # Verify `write_records` call count self.assertEqual(mock_write_records.call_count, 4) @@ -65,7 +65,7 @@ def test_sync_with_state(self, mock_write_records, mock_authed_all_pages, mock_v self.assertEqual(mock_write_records.call_count, 3) # Verify `get_auth_all_pages` called with expected url - mock_authed_all_pages.assert_called_with(mock.ANY, 'https://api.github.com/repos/tap-github/events', mock.ANY, stream='events') + mock_authed_all_pages.assert_called_with(mock.ANY, 'https://api.github.com/repos/tap-github/events', mock.ANY, stream='events', skip_pagination=False) mock_write_records.assert_called_with(mock.ANY, {'id': 4, 'created_at': '2019-01-02T00:00:00Z', '_sdc_repository': 'tap-github'},time_extracted = mock.ANY) @@ -91,7 +91,7 @@ def test_without_child_stream(self, mock_get_child_records, mock_authed_get_all_ test_stream.sync_endpoint(test_client, {}, self.catalog, "tap-github", "", ["stargazers"], ["stargazers"]) # Verify that the authed_get_all_pages() is called with the expected url - mock_authed_get_all_pages.assert_called_with(mock.ANY, "https://api.github.com/repos/tap-github/stargazers", mock.ANY, stream='stargazers') + mock_authed_get_all_pages.assert_called_with(mock.ANY, "https://api.github.com/repos/tap-github/stargazers", mock.ANY, stream='stargazers', skip_pagination=False) # Verify that the get_child_records() is not called as Stargazers doesn't have a child stream self.assertFalse(mock_get_child_records.called) @@ -110,7 +110,7 @@ def test_with_child_streams(self, mock_get_child_records, mock_authed_get_all_p test_stream.sync_endpoint(test_client, {}, self.catalog, "tap-github", "", ["teams", "team_members"], ["teams","team_members"]) # Verify that the authed_get_all_pages() is called with the expected url - mock_authed_get_all_pages.assert_called_with(mock.ANY, "https://api.github.com/orgs/tap-github/teams", mock.ANY, stream='teams') + mock_authed_get_all_pages.assert_called_with(mock.ANY, "https://api.github.com/orgs/tap-github/teams", mock.ANY, stream='teams', skip_pagination=False) # Verify that the get_child_records() is called self.assertTrue(mock_get_child_records.called) @@ -136,9 +136,9 @@ def test_with_nested_child_streams(self, mock_authed_get_all_pages, mock_verify_ self.assertEqual(mock_authed_get_all_pages.call_count, 4) # Verify that the authed_get_all_pages() is called with the expected url - exp_call_1 = mock.call(mock.ANY, "https://api.github.com/orgs/tap-github/teams", mock.ANY, stream='teams') - exp_call_2 = mock.call(mock.ANY, "https://api.github.com/orgs/tap-github/teams/stitch-dev/members", stream='team_members') - exp_call_3 = mock.call(mock.ANY, "https://api.github.com/orgs/tap-github/teams/stitch-dev/memberships/log1", stream='team_memberships') + exp_call_1 = mock.call(mock.ANY, "https://api.github.com/orgs/tap-github/teams", mock.ANY, stream='teams', skip_pagination=False) + exp_call_2 = mock.call(mock.ANY, "https://api.github.com/orgs/tap-github/teams/stitch-dev/members", stream='team_members', skip_pagination=False) + exp_call_3 = mock.call(mock.ANY, "https://api.github.com/orgs/tap-github/teams/stitch-dev/memberships/log1", stream='team_memberships', skip_pagination=False) self.assertEqual(mock_authed_get_all_pages.mock_calls[0], exp_call_1) self.assertEqual(mock_authed_get_all_pages.mock_calls[1], exp_call_2) @@ -166,7 +166,7 @@ def test_without_child_stream(self, mock_get_child_records, mock_authed_get_all_ test_stream.sync_endpoint(test_client, {}, self.catalog, "tap-github", "", ["commits"], ["commits"]) # Verify that the authed_get_all_pages() is called with the expected url - mock_authed_get_all_pages.assert_called_with(mock.ANY, "https://api.github.com/repos/tap-github/commits?since=", mock.ANY, stream='commits') + mock_authed_get_all_pages.assert_called_with(mock.ANY, "https://api.github.com/repos/tap-github/commits?since=", mock.ANY, stream='commits', skip_pagination=False) # Verify that the get_child_records() is not called as Commits does not contain any child stream. self.assertFalse(mock_get_child_records.called) @@ -184,8 +184,8 @@ def test_with_child_streams(self, mock_get_child_records, mock_authed_get_all_p test_stream.sync_endpoint(test_client, {}, self.catalog, "tap-github", "", ["projects", "project_columns"], ["projects","project_columns"]) # Verify that the authed_get_all_pages() is called with the expected url - mock_authed_get_all_pages.assert_called_with(mock.ANY, "https://api.github.com/repos/tap-github/projects?state=all", mock.ANY, stream='projects') - + mock_authed_get_all_pages.assert_called_with(mock.ANY, "https://api.github.com/repos/tap-github/projects?state=all", mock.ANY, stream='projects', skip_pagination=False) + # Verify that the get_child_records() is called as thw Projects stream has a child stream self.assertTrue(mock_get_child_records.called) @@ -208,9 +208,9 @@ def test_with_nested_child_streams(self, mock_authed_get_all_pages, mock_verify_ # Verify that the authed_get_all_pages() is called expected number of times self.assertEqual(mock_authed_get_all_pages.call_count, 4) - exp_call_1 = mock.call(mock.ANY, "https://api.github.com/repos/tap-github/projects?state=all", mock.ANY, stream='projects') - exp_call_2 = mock.call(mock.ANY, "https://api.github.com/projects/1/columns", stream='project_columns') - exp_call_3 = mock.call(mock.ANY, "https://api.github.com/projects/columns/1/cards", stream='project_cards') + exp_call_1 = mock.call(mock.ANY, "https://api.github.com/repos/tap-github/projects?state=all", mock.ANY, stream='projects', skip_pagination=False) + exp_call_2 = mock.call(mock.ANY, "https://api.github.com/projects/1/columns", stream='project_columns', skip_pagination=False) + exp_call_3 = mock.call(mock.ANY, "https://api.github.com/projects/columns/1/cards", stream='project_cards', skip_pagination=False) # Verify that the API calls are done as expected with the correct url self.assertEqual(mock_authed_get_all_pages.mock_calls[0], exp_call_1) @@ -240,7 +240,7 @@ def test_without_child_stream(self, mock_get_child_records, mock_strptime_to_utc test_stream.sync_endpoint(test_client, {}, self.catalog, "tap-github", "", ["pull_requests"], ["pull_requests"]) # Verify that the authed_get_all_pages() is called with the expected url - mock_authed_get_all_pages.assert_called_with(mock.ANY, "https://api.github.com/repos/tap-github/pulls?state=all&sort=updated&direction=desc", stream='pull_requests') + mock_authed_get_all_pages.assert_called_with(mock.ANY, "https://api.github.com/repos/tap-github/pulls?state=all&sort=updated&direction=desc", stream='pull_requests', skip_pagination=False) @mock.patch("tap_github.streams.Stream.get_child_records") @@ -257,7 +257,7 @@ def test_with_child_streams(self, mock_get_child_records, mock_strptime_to_utc, test_stream.sync_endpoint(test_client, {}, self.catalog, "tap-github", "", ["pull_requests", "review_comments"], ["pull_requests","review_comments"]) # Verify that the authed_get_all_pages() is called with the expected url - mock_authed_get_all_pages.assert_called_with(mock.ANY, "https://api.github.com/repos/tap-github/pulls?state=all&sort=updated&direction=desc", stream='pull_requests') + mock_authed_get_all_pages.assert_called_with(mock.ANY, "https://api.github.com/repos/tap-github/pulls?state=all&sort=updated&direction=desc", stream='pull_requests', skip_pagination=False) # Verify that the get_child_records() is called as the PullRequests stream has a child stream self.assertTrue(mock_get_child_records.called) @@ -281,8 +281,8 @@ def test_with_nested_child_streams(self, mock_strptime_to_utc, mock_authed_get_a self.assertEqual(mock_authed_get_all_pages.call_count, 2) print(mock_authed_get_all_pages.mock_calls) - exp_call_1 = mock.call(mock.ANY, "https://api.github.com/repos/tap-github/pulls?state=all&sort=updated&direction=desc", stream='pull_requests') - exp_call_2 = mock.call(mock.ANY, "https://api.github.com/repos/tap-github/pulls/1/comments?sort=updated_at&direction=desc", stream='review_comments') + exp_call_1 = mock.call(mock.ANY, "https://api.github.com/repos/tap-github/pulls?state=all&sort=updated&direction=desc", stream='pull_requests', skip_pagination=False) + exp_call_2 = mock.call(mock.ANY, "https://api.github.com/repos/tap-github/pulls/1/comments?sort=updated_at&direction=desc", stream='review_comments', skip_pagination=False) # Verify that the API calls are done as expected with the correct url self.assertEqual(mock_authed_get_all_pages.mock_calls[0], exp_call_1) From 0d4347cecab1c2575325a93eaa0b276bee3ef17c Mon Sep 17 00:00:00 2001 From: Joao Amaral <7281460+joaopamaral@users.noreply.github.com> Date: Tue, 14 Jan 2025 18:20:41 -0300 Subject: [PATCH 38/52] Bump version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 1236da87..fe4ece57 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from setuptools import setup, find_packages setup(name='tap-github', - version='2.0.8', + version='2.0.9', description='Singer.io tap for extracting data from the GitHub API', author='Stitch', url='http://singer.io', From 3a94da032ac416c537e2890f476203a68661c396 Mon Sep 17 00:00:00 2001 From: Joao Amaral <7281460+joaopamaral@users.noreply.github.com> Date: Thu, 16 Jan 2025 13:42:22 -0300 Subject: [PATCH 39/52] Update tap_github/streams.py Co-authored-by: Igor Khrol --- tap_github/streams.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tap_github/streams.py b/tap_github/streams.py index b1b00073..06f7dfa5 100644 --- a/tap_github/streams.py +++ b/tap_github/streams.py @@ -200,7 +200,7 @@ def get_child_records(self, singer.write_record(child_object.tap_stream_id, rec, time_extracted=extraction_time) - # Loop thru each child and nested child in the parent and fetch all the child records. + # Loop through each child and nested child in the parent and fetch all the child records. for nested_child in child_object.children: if nested_child in stream_to_sync: # Collect id of child record to pass in the API of its sub-child. From 1183f661d9082e4cc34f87c178232522fda5a051 Mon Sep 17 00:00:00 2001 From: Joao Amaral <7281460+joaopamaral@users.noreply.github.com> Date: Thu, 16 Jan 2025 15:01:09 -0300 Subject: [PATCH 40/52] Refactor auth get pages --- tap_github/client.py | 16 +++++++---- tap_github/streams.py | 20 +++++++------- tests/unittests/test_exception_handling.py | 4 +-- tests/unittests/test_get_all_repos.py | 20 +++++++------- tests/unittests/test_sync_endpoint.py | 32 +++++++++++----------- tests/unittests/test_timeout.py | 4 +-- 6 files changed, 50 insertions(+), 46 deletions(-) diff --git a/tap_github/client.py b/tap_github/client.py index 370fea9c..932d1b86 100644 --- a/tap_github/client.py +++ b/tap_github/client.py @@ -206,7 +206,7 @@ def set_auth_in_session(self): @backoff.on_exception(backoff.expo, (requests.Timeout, requests.ConnectionError, Server5xxError, TooManyRequests), max_tries=5, factor=2) @backoff.on_exception(backoff.expo, (BadCredentialsException, ), max_tries=3, factor=2) - def authed_get(self, source, url, headers={}, stream="", should_skip_404 = True): + def authed_get_single_page(self, source, url, headers={}, stream="", should_skip_404 = True): """ Call rest API and return the response in case of status code 200. """ @@ -222,19 +222,23 @@ def authed_get(self, source, url, headers={}, stream="", should_skip_404 = True) resp._content = b'{}' # pylint: disable=protected-access return resp - def authed_get_all_pages(self, source, url, headers={}, stream="", should_skip_404 = True, skip_pagination=False): + def authed_get_all_pages(self, source, url, headers={}, stream="", should_skip_404 = True): """ Fetch all pages of records and return them. """ next_url = self.prepare_url(url) while next_url: - response = self.authed_get(source, next_url, headers, stream, should_skip_404) + response = self.authed_get_single_page(source, next_url, headers, stream, should_skip_404) yield response - if skip_pagination: - break next_url = response.links.get('next', {}).get('url', None) + def authed_get(self, source, url, headers={}, stream="", should_skip_404=True, single_page=False): + if single_page: + yield self.authed_get_single_page(source, url, headers, stream, should_skip_404) + else: + yield from self.authed_get_all_pages(source, url, headers, stream, should_skip_404) + def prepare_url(self, url): """ Prepare the URL with some additional parameters @@ -249,7 +253,7 @@ def verify_repo_access(self, url_for_repo, repo): Call rest API to verify that the user has sufficient permissions to access this repository. """ try: - self.authed_get("verifying repository access", url_for_repo, should_skip_404=False) + self.authed_get_single_page("verifying repository access", url_for_repo, should_skip_404=False) except NotFoundException: # Throwing user-friendly error message as it checks token access message = "HTTP-error-code: 404, Error: Please check the repository name \'{}\' or you do not have sufficient permissions to access this repository.".format(repo) diff --git a/tap_github/streams.py b/tap_github/streams.py index b1b00073..16090de8 100644 --- a/tap_github/streams.py +++ b/tap_github/streams.py @@ -70,7 +70,7 @@ class Stream: use_repository = False headers = {'Accept': '*/*'} parent = None - skip_pagination = False + single_page = False def build_url(self, base_url, repo_path, bookmark): """ @@ -156,11 +156,11 @@ def get_child_records(self, stream_catalog = get_schema(catalog, child_object.tap_stream_id) with metrics.record_counter(child_object.tap_stream_id) as counter: - for response in client.authed_get_all_pages( + for response in client.authed_get( child_object.tap_stream_id, child_full_url, stream = child_object.tap_stream_id, - skip_pagination = child_object.skip_pagination + single_page = child_object.single_page ): records = response.json() extraction_time = singer.utils.now() @@ -238,12 +238,12 @@ def sync_endpoint(self, stream_catalog = get_schema(catalog, self.tap_stream_id) with metrics.record_counter(self.tap_stream_id) as counter: - for response in client.authed_get_all_pages( + for response in client.authed_get( self.tap_stream_id, full_url, self.headers, stream = self.tap_stream_id, - skip_pagination = self.skip_pagination + single_page = self.single_page ): records = response.json() extraction_time = singer.utils.now() @@ -310,12 +310,12 @@ def sync_endpoint(self, stream_catalog = get_schema(catalog, self.tap_stream_id) with metrics.record_counter(self.tap_stream_id) as counter: - for response in client.authed_get_all_pages( + for response in client.authed_get( self.tap_stream_id, full_url, self.headers, stream = self.tap_stream_id, - skip_pagination = self.skip_pagination + single_page = self.single_page ): records = response.json() extraction_time = singer.utils.now() @@ -398,11 +398,11 @@ def sync_endpoint(self, parent_bookmark_value = bookmark_value record_counter = 0 with metrics.record_counter(self.tap_stream_id) as counter: - for response in client.authed_get_all_pages( + for response in client.authed_get( self.tap_stream_id, full_url, stream = self.tap_stream_id, - skip_pagination = self.skip_pagination + single_page = self.single_page ): records = response.json() extraction_time = singer.utils.now() @@ -865,7 +865,7 @@ class RepoForkedCompares(FullTableStream): path = "repos/{}/{}/compare/{}...{}:{}" parent = 'repo_forked_parents' id_keys = ['fork_owner_login', 'fork_default_branch'] - skip_pagination = True # No pagination required for this stream. + single_page = True # No pagination required for this stream. def add_fields_at_1st_level(self, record, parent_record = None): """ diff --git a/tests/unittests/test_exception_handling.py b/tests/unittests/test_exception_handling.py index 05e48851..27f85be3 100644 --- a/tests/unittests/test_exception_handling.py +++ b/tests/unittests/test_exception_handling.py @@ -83,7 +83,7 @@ def test_error_message_and_call_count(self, mocked_parse_args, mocked_request, m expected_error_message = "HTTP-error-code: {}, Error: {}".format(erro_code, error_msg) with self.assertRaises(error_class) as e: - test_client.authed_get("", "") + test_client.authed_get_single_page("", "") # Verifying the message formed for the custom exception self.assertEqual(str(e.exception), expected_error_message) @@ -101,7 +101,7 @@ def test_skip_404_error(self, mock_logger, mocked_parse_args, mocked_request, m expected_message = "HTTP-error-code: 404, Error: The resource you have specified cannot be found. Alternatively the access_token is not valid for the resource. Please refer '{}' for more details.".format(json.get("documentation_url")) test_client = GithubClient(self.config) - test_client.authed_get("", "") + test_client.authed_get_single_page("", "") # Verifying the message formed for the custom exception self.assertEqual(mock_logger.mock_calls[0], mock.call(expected_message)) diff --git a/tests/unittests/test_get_all_repos.py b/tests/unittests/test_get_all_repos.py index 5db16fef..fd332381 100644 --- a/tests/unittests/test_get_all_repos.py +++ b/tests/unittests/test_get_all_repos.py @@ -90,39 +90,39 @@ def test_multiple_organizations(self, mocked_authed_get_all_pages, mocked_verify self.assertListEqual(expected_repositories, side_effect) @mock.patch('tap_github.client.GithubClient.verify_repo_access') -@mock.patch('tap_github.client.GithubClient.authed_get') +@mock.patch('tap_github.client.GithubClient.authed_get_single_page') class TestAuthedGetAllPages(unittest.TestCase): """ Test `authed_get_all_pages` method from client. """ config = {"access_token": "", "repository": "test-org/repo1", "max_per_page": 100} - def test_for_one_page(self, mock_auth_get, mock_verify_access): + def test_for_one_page(self, mock_auth_get_single_page, mock_verify_access): """Verify `authed_get` is called only once if one page is available.""" test_client = GithubClient(self.config) - mock_auth_get.return_value = MockResponse({}) + mock_auth_get_single_page.return_value = MockResponse({}) list(test_client.authed_get_all_pages("", "http://mock_url", {})) # Verify `auth_get` call count - self.assertEqual(mock_auth_get.call_count, 1) + self.assertEqual(mock_auth_get_single_page.call_count, 1) - def test_for_multiple_pages(self, mock_auth_get, mock_verify_access): + def test_for_multiple_pages(self, mock_auth_get_single_page, mock_verify_access): """Verify `authed_get` is called equal number times as pages available.""" test_client = GithubClient(self.config) - mock_auth_get.side_effect = [MockResponse({"next": {"url": "http://mock_url_2/?per_page=100"}}), + mock_auth_get_single_page.side_effect = [MockResponse({"next": {"url": "http://mock_url_2/?per_page=100"}}), MockResponse({"next": {"url": "http://mock_url_3/?per_page=100"}}),MockResponse({})] list(test_client.authed_get_all_pages("", "http://mock_url_1", {})) # Verify `auth_get` call count - self.assertEqual(mock_auth_get.call_count, 3) + self.assertEqual(mock_auth_get_single_page.call_count, 3) # Verify `auth_get` calls with expected url - self.assertEqual(mock_auth_get.mock_calls[0], mock.call("", "http://mock_url_1/?per_page=100", {}, '', True)) - self.assertEqual(mock_auth_get.mock_calls[1], mock.call("", "http://mock_url_2/?per_page=100", {}, '', True)) - self.assertEqual(mock_auth_get.mock_calls[2], mock.call("", "http://mock_url_3/?per_page=100", {}, '', True)) + self.assertEqual(mock_auth_get_single_page.mock_calls[0], mock.call("", "http://mock_url_1/?per_page=100", {}, '', True)) + self.assertEqual(mock_auth_get_single_page.mock_calls[1], mock.call("", "http://mock_url_2/?per_page=100", {}, '', True)) + self.assertEqual(mock_auth_get_single_page.mock_calls[2], mock.call("", "http://mock_url_3/?per_page=100", {}, '', True)) diff --git a/tests/unittests/test_sync_endpoint.py b/tests/unittests/test_sync_endpoint.py index 86bc4348..4ba60e47 100644 --- a/tests/unittests/test_sync_endpoint.py +++ b/tests/unittests/test_sync_endpoint.py @@ -37,7 +37,7 @@ def test_sync_without_state(self, mock_write_records, mock_authed_all_pages, moc self.assertEqual(final_state, expected_state) # Verify `get_auth_all_pages` called with expected url - mock_authed_all_pages.assert_called_with(mock.ANY, 'https://api.github.com/repos/tap-github/events', mock.ANY, stream='events', skip_pagination=False) + mock_authed_all_pages.assert_called_with(mock.ANY, 'https://api.github.com/repos/tap-github/events', mock.ANY, 'events', True) # Verify `write_records` call count self.assertEqual(mock_write_records.call_count, 4) @@ -65,7 +65,7 @@ def test_sync_with_state(self, mock_write_records, mock_authed_all_pages, mock_v self.assertEqual(mock_write_records.call_count, 3) # Verify `get_auth_all_pages` called with expected url - mock_authed_all_pages.assert_called_with(mock.ANY, 'https://api.github.com/repos/tap-github/events', mock.ANY, stream='events', skip_pagination=False) + mock_authed_all_pages.assert_called_with(mock.ANY, 'https://api.github.com/repos/tap-github/events', mock.ANY, 'events', True) mock_write_records.assert_called_with(mock.ANY, {'id': 4, 'created_at': '2019-01-02T00:00:00Z', '_sdc_repository': 'tap-github'},time_extracted = mock.ANY) @@ -91,7 +91,7 @@ def test_without_child_stream(self, mock_get_child_records, mock_authed_get_all_ test_stream.sync_endpoint(test_client, {}, self.catalog, "tap-github", "", ["stargazers"], ["stargazers"]) # Verify that the authed_get_all_pages() is called with the expected url - mock_authed_get_all_pages.assert_called_with(mock.ANY, "https://api.github.com/repos/tap-github/stargazers", mock.ANY, stream='stargazers', skip_pagination=False) + mock_authed_get_all_pages.assert_called_with(mock.ANY, "https://api.github.com/repos/tap-github/stargazers", mock.ANY, 'stargazers', True) # Verify that the get_child_records() is not called as Stargazers doesn't have a child stream self.assertFalse(mock_get_child_records.called) @@ -110,7 +110,7 @@ def test_with_child_streams(self, mock_get_child_records, mock_authed_get_all_p test_stream.sync_endpoint(test_client, {}, self.catalog, "tap-github", "", ["teams", "team_members"], ["teams","team_members"]) # Verify that the authed_get_all_pages() is called with the expected url - mock_authed_get_all_pages.assert_called_with(mock.ANY, "https://api.github.com/orgs/tap-github/teams", mock.ANY, stream='teams', skip_pagination=False) + mock_authed_get_all_pages.assert_called_with(mock.ANY, "https://api.github.com/orgs/tap-github/teams", mock.ANY, 'teams', True) # Verify that the get_child_records() is called self.assertTrue(mock_get_child_records.called) @@ -136,9 +136,9 @@ def test_with_nested_child_streams(self, mock_authed_get_all_pages, mock_verify_ self.assertEqual(mock_authed_get_all_pages.call_count, 4) # Verify that the authed_get_all_pages() is called with the expected url - exp_call_1 = mock.call(mock.ANY, "https://api.github.com/orgs/tap-github/teams", mock.ANY, stream='teams', skip_pagination=False) - exp_call_2 = mock.call(mock.ANY, "https://api.github.com/orgs/tap-github/teams/stitch-dev/members", stream='team_members', skip_pagination=False) - exp_call_3 = mock.call(mock.ANY, "https://api.github.com/orgs/tap-github/teams/stitch-dev/memberships/log1", stream='team_memberships', skip_pagination=False) + exp_call_1 = mock.call(mock.ANY, "https://api.github.com/orgs/tap-github/teams", mock.ANY, 'teams', True) + exp_call_2 = mock.call(mock.ANY, "https://api.github.com/orgs/tap-github/teams/stitch-dev/members", {}, 'team_members', True) + exp_call_3 = mock.call(mock.ANY, "https://api.github.com/orgs/tap-github/teams/stitch-dev/memberships/log1", {}, 'team_memberships', True) self.assertEqual(mock_authed_get_all_pages.mock_calls[0], exp_call_1) self.assertEqual(mock_authed_get_all_pages.mock_calls[1], exp_call_2) @@ -166,7 +166,7 @@ def test_without_child_stream(self, mock_get_child_records, mock_authed_get_all_ test_stream.sync_endpoint(test_client, {}, self.catalog, "tap-github", "", ["commits"], ["commits"]) # Verify that the authed_get_all_pages() is called with the expected url - mock_authed_get_all_pages.assert_called_with(mock.ANY, "https://api.github.com/repos/tap-github/commits?since=", mock.ANY, stream='commits', skip_pagination=False) + mock_authed_get_all_pages.assert_called_with(mock.ANY, "https://api.github.com/repos/tap-github/commits?since=", mock.ANY, 'commits', True) # Verify that the get_child_records() is not called as Commits does not contain any child stream. self.assertFalse(mock_get_child_records.called) @@ -184,7 +184,7 @@ def test_with_child_streams(self, mock_get_child_records, mock_authed_get_all_p test_stream.sync_endpoint(test_client, {}, self.catalog, "tap-github", "", ["projects", "project_columns"], ["projects","project_columns"]) # Verify that the authed_get_all_pages() is called with the expected url - mock_authed_get_all_pages.assert_called_with(mock.ANY, "https://api.github.com/repos/tap-github/projects?state=all", mock.ANY, stream='projects', skip_pagination=False) + mock_authed_get_all_pages.assert_called_with(mock.ANY, "https://api.github.com/repos/tap-github/projects?state=all", mock.ANY, 'projects', True) # Verify that the get_child_records() is called as thw Projects stream has a child stream self.assertTrue(mock_get_child_records.called) @@ -208,9 +208,9 @@ def test_with_nested_child_streams(self, mock_authed_get_all_pages, mock_verify_ # Verify that the authed_get_all_pages() is called expected number of times self.assertEqual(mock_authed_get_all_pages.call_count, 4) - exp_call_1 = mock.call(mock.ANY, "https://api.github.com/repos/tap-github/projects?state=all", mock.ANY, stream='projects', skip_pagination=False) - exp_call_2 = mock.call(mock.ANY, "https://api.github.com/projects/1/columns", stream='project_columns', skip_pagination=False) - exp_call_3 = mock.call(mock.ANY, "https://api.github.com/projects/columns/1/cards", stream='project_cards', skip_pagination=False) + exp_call_1 = mock.call(mock.ANY, "https://api.github.com/repos/tap-github/projects?state=all", mock.ANY, 'projects', True) + exp_call_2 = mock.call(mock.ANY, "https://api.github.com/projects/1/columns", {}, 'project_columns', True) + exp_call_3 = mock.call(mock.ANY, "https://api.github.com/projects/columns/1/cards", {}, 'project_cards', True) # Verify that the API calls are done as expected with the correct url self.assertEqual(mock_authed_get_all_pages.mock_calls[0], exp_call_1) @@ -240,7 +240,7 @@ def test_without_child_stream(self, mock_get_child_records, mock_strptime_to_utc test_stream.sync_endpoint(test_client, {}, self.catalog, "tap-github", "", ["pull_requests"], ["pull_requests"]) # Verify that the authed_get_all_pages() is called with the expected url - mock_authed_get_all_pages.assert_called_with(mock.ANY, "https://api.github.com/repos/tap-github/pulls?state=all&sort=updated&direction=desc", stream='pull_requests', skip_pagination=False) + mock_authed_get_all_pages.assert_called_with(mock.ANY, "https://api.github.com/repos/tap-github/pulls?state=all&sort=updated&direction=desc", {}, 'pull_requests', True) @mock.patch("tap_github.streams.Stream.get_child_records") @@ -257,7 +257,7 @@ def test_with_child_streams(self, mock_get_child_records, mock_strptime_to_utc, test_stream.sync_endpoint(test_client, {}, self.catalog, "tap-github", "", ["pull_requests", "review_comments"], ["pull_requests","review_comments"]) # Verify that the authed_get_all_pages() is called with the expected url - mock_authed_get_all_pages.assert_called_with(mock.ANY, "https://api.github.com/repos/tap-github/pulls?state=all&sort=updated&direction=desc", stream='pull_requests', skip_pagination=False) + mock_authed_get_all_pages.assert_called_with(mock.ANY, "https://api.github.com/repos/tap-github/pulls?state=all&sort=updated&direction=desc", {}, 'pull_requests', True) # Verify that the get_child_records() is called as the PullRequests stream has a child stream self.assertTrue(mock_get_child_records.called) @@ -281,8 +281,8 @@ def test_with_nested_child_streams(self, mock_strptime_to_utc, mock_authed_get_a self.assertEqual(mock_authed_get_all_pages.call_count, 2) print(mock_authed_get_all_pages.mock_calls) - exp_call_1 = mock.call(mock.ANY, "https://api.github.com/repos/tap-github/pulls?state=all&sort=updated&direction=desc", stream='pull_requests', skip_pagination=False) - exp_call_2 = mock.call(mock.ANY, "https://api.github.com/repos/tap-github/pulls/1/comments?sort=updated_at&direction=desc", stream='review_comments', skip_pagination=False) + exp_call_1 = mock.call(mock.ANY, "https://api.github.com/repos/tap-github/pulls?state=all&sort=updated&direction=desc", {}, 'pull_requests', True) + exp_call_2 = mock.call(mock.ANY, "https://api.github.com/repos/tap-github/pulls/1/comments?sort=updated_at&direction=desc", {}, 'review_comments', True) # Verify that the API calls are done as expected with the correct url self.assertEqual(mock_authed_get_all_pages.mock_calls[0], exp_call_1) diff --git a/tests/unittests/test_timeout.py b/tests/unittests/test_timeout.py index a3f6ca53..09129684 100644 --- a/tests/unittests/test_timeout.py +++ b/tests/unittests/test_timeout.py @@ -73,7 +73,7 @@ def test_timeout_value_in_config(self, mocked_parse_args, mocked_request, mocked # get the timeout value for assertion timeout = test_client.get_request_timeout() # function call - test_client.authed_get("test_source", "") + test_client.authed_get_single_page("test_source", "") # verify that we got expected timeout value self.assertEqual(expected_value, timeout) @@ -107,7 +107,7 @@ def test_backoff(self, mocked_parse_args, mocked_request, mocked_sleep, mock_ver test_client = GithubClient(mock_config) with self.assertRaises(error_class): - test_client.authed_get("test_source", "") + test_client.authed_get_single_page("test_source", "") # verify that we backoff 5 times self.assertEqual(5, mocked_request.call_count) From 9db65db175d569645f3d2d1276d5caff34b6adc4 Mon Sep 17 00:00:00 2001 From: Joao Amaral <7281460+joaopamaral@users.noreply.github.com> Date: Fri, 17 Jan 2025 13:05:29 -0300 Subject: [PATCH 41/52] Add error_message --- tap_github/schemas/repo_forked_compares.json | 3 +++ tap_github/streams.py | 11 +++++++++++ 2 files changed, 14 insertions(+) diff --git a/tap_github/schemas/repo_forked_compares.json b/tap_github/schemas/repo_forked_compares.json index 0b756fbf..70ad5d02 100644 --- a/tap_github/schemas/repo_forked_compares.json +++ b/tap_github/schemas/repo_forked_compares.json @@ -69,6 +69,9 @@ "items": { "type": ["null", "object"] } + }, + "error_message": { + "type": ["null", "string"] } } } diff --git a/tap_github/streams.py b/tap_github/streams.py index a61fde6c..627eb45e 100644 --- a/tap_github/streams.py +++ b/tap_github/streams.py @@ -847,6 +847,13 @@ def get_child_records(self, except UnprocessableError as e: error_message = str(e) if "HTTP-error-code: 422" in error_message and "Sorry, this diff is taking too long to generate" in error_message: + child_object = STREAMS[child_stream]() + record = {'_sdc_repository': repo_path, 'error_message': error_message} + child_object.add_fields_at_1st_level(record=record, parent_record=parent_record) + with singer.Transformer() as transformer: + if child_object.tap_stream_id in selected_stream_ids: + singer.write_record(child_object.tap_stream_id, record, time_extracted=singer.utils.now()) + LOGGER.warning(f'Can\'t compare the forked repository ({selected_stream_ids}) with the parent repository. Error: {error_message}') else: raise e @@ -878,6 +885,10 @@ def add_fields_at_1st_level(self, record, parent_record = None): record['fork_owner_login'] = parent_record['fork_owner_login'] record['fork_name'] = parent_record['fork_name'] record['fork_default_branch'] = parent_record['fork_default_branch'] + if 'status' not in record and 'error_message' not in record: + record['error_message'] = ('Cannot retrieve ahead/behind information for this branch. ' + 'It may happen if the parent repository has been deleted or ' + 'no common ancestor between the default branches.') # Dictionary of the stream classes From c85e5239ae3d5a10e6d0b0d5ad0b5e11cf9434d9 Mon Sep 17 00:00:00 2001 From: Igor Khrol Date: Tue, 26 Aug 2025 16:14:32 +0300 Subject: [PATCH 42/52] Python 3.12 support: update requests dependency --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index fe4ece57..f462d2d2 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from setuptools import setup, find_packages setup(name='tap-github', - version='2.0.9', + version='2.0.10', description='Singer.io tap for extracting data from the GitHub API', author='Stitch', url='http://singer.io', @@ -11,7 +11,7 @@ py_modules=['tap_github'], install_requires=[ 'singer-python==5.12.1', - 'requests==2.20.0', + 'requests==2.32.5', 'backoff==1.8.0' ], extras_require={ From 281bb76c9654d0a63262947ac4ea583096c827a6 Mon Sep 17 00:00:00 2001 From: Igor Khrol Date: Tue, 2 Sep 2025 12:26:18 +0300 Subject: [PATCH 43/52] Freeze constraints --- constraints.txt | 36 ++++++++++++++++++++++++++++++++++++ setup.py | 2 +- 2 files changed, 37 insertions(+), 1 deletion(-) create mode 100644 constraints.txt diff --git a/constraints.txt b/constraints.txt new file mode 100644 index 00000000..b8c0d08f --- /dev/null +++ b/constraints.txt @@ -0,0 +1,36 @@ +# +# This file is autogenerated by pip-compile with Python 3.12 +# by the following command: +# +# pip-compile --output-file=constraints.txt setup.py +# +--index-url https://nexus.a8c.com/repository/pypi/simple + +backoff==1.8.0 + # via + # singer-python + # tap-github (setup.py) +certifi==2025.8.3 + # via requests +charset-normalizer==3.4.3 + # via requests +ciso8601==2.3.3 + # via singer-python +idna==3.10 + # via requests +jsonschema==2.6.0 + # via singer-python +python-dateutil==2.9.0.post0 + # via singer-python +pytz==2018.4 + # via singer-python +requests==2.32.5 + # via tap-github (setup.py) +simplejson==3.11.1 + # via singer-python +singer-python==5.12.1 + # via tap-github (setup.py) +six==1.17.0 + # via python-dateutil +urllib3==2.5.0 + # via requests diff --git a/setup.py b/setup.py index f462d2d2..0c350267 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from setuptools import setup, find_packages setup(name='tap-github', - version='2.0.10', + version='2.0.11', description='Singer.io tap for extracting data from the GitHub API', author='Stitch', url='http://singer.io', From 195d731aed46f7f5fe18e38fd9b655e14fb1df7f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaros=C5=82aw=20Cellary?= Date: Thu, 30 Oct 2025 09:32:17 +0100 Subject: [PATCH 44/52] Rate limit fixes --- setup.py | 2 +- tap_github/client.py | 22 ++++++++++++++++++++-- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 0c350267..c4e19738 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from setuptools import setup, find_packages setup(name='tap-github', - version='2.0.11', + version='2.0.12', description='Singer.io tap for extracting data from the GitHub API', author='Stitch', url='http://singer.io', diff --git a/tap_github/client.py b/tap_github/client.py index 932d1b86..b0e88e2d 100644 --- a/tap_github/client.py +++ b/tap_github/client.py @@ -16,7 +16,11 @@ # Set default timeout of 300 seconds REQUEST_TIMEOUT = 300 +# How many total seconds to retry when getting rate limit error from API +RATE_LIMIT_RETRY_MAX_TIME = 600 + PAGINATION_EXCEED_MSG = 'In order to keep the API fast for everyone, pagination is limited for this resource.' +RATE_LIMIT_EXCEED_MSG = 'API rate limit exceeded' class GithubException(Exception): pass @@ -54,6 +58,9 @@ class ConflictError(GithubException): class RateLimitExceeded(GithubException): pass +class RateLimitSleepExceeded(GithubException): + pass + class TooManyRequests(GithubException): pass @@ -111,6 +118,11 @@ def raise_for_error(resp, source, stream, client, should_skip_404): except JSONDecodeError: response_json = {} + if error_code == 403 and RATE_LIMIT_EXCEED_MSG in response_json.get('message', ''): + message = f"HTTP-error-code: 403, Error: {response_json.get('message', '')}. " + LOGGER.warning(message) + raise RateLimitExceeded() from None + if error_code == 404 and should_skip_404: # Add not accessible stream into list. client.not_accessible_repos.add(stream) @@ -150,13 +162,18 @@ def rate_throttling(response, max_sleep_seconds, min_remain_rate_limit): """ For rate limit errors, get the remaining time before retrying and calculate the time to sleep before making a new request. """ + if "Retry-After" in response.headers: + # handles the secondary rate limit + seconds_to_sleep = int(response.headers['Retry-After']) + LOGGER.info("Retry-After header found in response. Tap will retry the data collection after %s seconds.", seconds_to_sleep) + time.sleep(seconds_to_sleep) if 'X-RateLimit-Remaining' in response.headers: if int(response.headers['X-RateLimit-Remaining']) <= min_remain_rate_limit: - seconds_to_sleep = calculate_seconds(int(response.headers['X-RateLimit-Reset'])) + seconds_to_sleep = calculate_seconds(int(response.headers['X-RateLimit-Reset']) + 15) if seconds_to_sleep > max_sleep_seconds: message = "API rate limit exceeded, please try after {} seconds.".format(seconds_to_sleep) - raise RateLimitExceeded(message) from None + raise RateLimitSleepExceeded(message) from None LOGGER.info("API rate limit exceeded. Tap will retry the data collection after %s seconds.", seconds_to_sleep) time.sleep(seconds_to_sleep) @@ -206,6 +223,7 @@ def set_auth_in_session(self): @backoff.on_exception(backoff.expo, (requests.Timeout, requests.ConnectionError, Server5xxError, TooManyRequests), max_tries=5, factor=2) @backoff.on_exception(backoff.expo, (BadCredentialsException, ), max_tries=3, factor=2) + @backoff.on_exception(backoff.constant, (RateLimitExceeded, ), jitter=None, interval=60, max_time=RATE_LIMIT_RETRY_MAX_TIME) def authed_get_single_page(self, source, url, headers={}, stream="", should_skip_404 = True): """ Call rest API and return the response in case of status code 200. From c557810606a23e2fa7aacccb1f233255a46428d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaros=C5=82aw=20Cellary?= Date: Thu, 30 Oct 2025 09:41:55 +0100 Subject: [PATCH 45/52] style fix --- tap_github/client.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/tap_github/client.py b/tap_github/client.py index b0e88e2d..c3c6639e 100644 --- a/tap_github/client.py +++ b/tap_github/client.py @@ -118,8 +118,10 @@ def raise_for_error(resp, source, stream, client, should_skip_404): except JSONDecodeError: response_json = {} - if error_code == 403 and RATE_LIMIT_EXCEED_MSG in response_json.get('message', ''): - message = f"HTTP-error-code: 403, Error: {response_json.get('message', '')}. " + response_message = response_json.get('message', '') + + if error_code == 403 and RATE_LIMIT_EXCEED_MSG in response_message: + message = f"HTTP-error-code: 403, Error: {response_message}" LOGGER.warning(message) raise RateLimitExceeded() from None @@ -134,8 +136,8 @@ def raise_for_error(resp, source, stream, client, should_skip_404): # Don't raise a NotFoundException return None - if error_code == 422 and PAGINATION_EXCEED_MSG in response_json.get('message', ''): - message = f"HTTP-error-code: 422, Error: {response_json.get('message', '')}. " \ + if error_code == 422 and PAGINATION_EXCEED_MSG in response_message: + message = f"HTTP-error-code: 422, Error: {response_message}. " \ f"Please refer '{response_json.get('documentation_url')}' for more details." \ "This is a known issue when the results exceed 40k and the last page is not full" \ " (it will trim the results to get only the available by the API)." @@ -223,7 +225,7 @@ def set_auth_in_session(self): @backoff.on_exception(backoff.expo, (requests.Timeout, requests.ConnectionError, Server5xxError, TooManyRequests), max_tries=5, factor=2) @backoff.on_exception(backoff.expo, (BadCredentialsException, ), max_tries=3, factor=2) - @backoff.on_exception(backoff.constant, (RateLimitExceeded, ), jitter=None, interval=60, max_time=RATE_LIMIT_RETRY_MAX_TIME) + @backoff.on_exception(backoff.constant, (RateLimitExceeded, ), interval=60, jitter=None, max_time=RATE_LIMIT_RETRY_MAX_TIME) def authed_get_single_page(self, source, url, headers={}, stream="", should_skip_404 = True): """ Call rest API and return the response in case of status code 200. From 5fd917727a1f2def5cb02ec26a792a36356af88c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaros=C5=82aw=20Cellary?= Date: Thu, 30 Oct 2025 09:48:30 +0100 Subject: [PATCH 46/52] fixed limit --- tap_github/client.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tap_github/client.py b/tap_github/client.py index c3c6639e..94c9cff7 100644 --- a/tap_github/client.py +++ b/tap_github/client.py @@ -16,8 +16,8 @@ # Set default timeout of 300 seconds REQUEST_TIMEOUT = 300 -# How many total seconds to retry when getting rate limit error from API -RATE_LIMIT_RETRY_MAX_TIME = 600 +# How many total seconds to retry when getting rate limit error from API. The limit resets every hour. +RATE_LIMIT_RETRY_MAX_TIME = 3600 PAGINATION_EXCEED_MSG = 'In order to keep the API fast for everyone, pagination is limited for this resource.' RATE_LIMIT_EXCEED_MSG = 'API rate limit exceeded' From 46fa9aa711299881ca6527bf7dd84416d0dd5a2d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaros=C5=82aw=20Cellary?= Date: Thu, 30 Oct 2025 11:25:46 +0100 Subject: [PATCH 47/52] added comments to exceptions --- tap_github/client.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tap_github/client.py b/tap_github/client.py index 94c9cff7..e182462f 100644 --- a/tap_github/client.py +++ b/tap_github/client.py @@ -55,12 +55,15 @@ class MovedPermanentlyError(GithubException): class ConflictError(GithubException): pass +# Thrown when we receive 403 Rate Limit Exceeded from Github API class RateLimitExceeded(GithubException): pass +# Thrown when we're expected to sleep for longer than the max_sleep_seconds limit class RateLimitSleepExceeded(GithubException): pass +# Thrown when 429 is received from Github API class TooManyRequests(GithubException): pass From e1577f4510feca8cf58624322964c57b4e1c5683 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaros=C5=82aw=20Cellary?= Date: Wed, 12 Nov 2025 13:45:29 +0100 Subject: [PATCH 48/52] Added _sdc_repository to commit comments --- setup.py | 2 +- tap_github/schemas/commit_comments.json | 221 ++++++++++++------------ 2 files changed, 113 insertions(+), 110 deletions(-) diff --git a/setup.py b/setup.py index c4e19738..2cbde148 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from setuptools import setup, find_packages setup(name='tap-github', - version='2.0.12', + version='2.0.13', description='Singer.io tap for extracting data from the GitHub API', author='Stitch', url='http://singer.io', diff --git a/tap_github/schemas/commit_comments.json b/tap_github/schemas/commit_comments.json index 408448dc..3c8679ef 100644 --- a/tap_github/schemas/commit_comments.json +++ b/tap_github/schemas/commit_comments.json @@ -1,114 +1,117 @@ { - "type": ["null", "object"], - "properties": { - "html_url": { - "type": ["null", "string"] - }, - "url": { - "type": ["null", "string"] - }, - "id": { - "type": ["null", "number"] - }, - "node_id": { - "type": ["null", "string"] - }, - "body": { - "type": ["null", "string"] - }, - "path": { - "type": ["null", "string"] - }, - "position": { - "type": ["null", "number"] - }, - "line": { - "type": ["null", "number"] - }, - "commit_id": { - "type": ["null", "string"] - }, - "user": { - "type": ["null", "object"], - "properties": { - "name": { - "type": ["null", "string"] - }, - "email": { - "type": ["null", "string"] - }, - "login": { - "type": ["null", "string"] - }, - "id": { - "type": ["null", "number"] - }, - "node_id": { - "type": ["null", "string"] - }, - "avatar_url": { - "type": ["null", "string"] - }, - "gravatar_id": { - "type": ["null", "string"] - }, - "url": { - "type": ["null", "string"] - }, - "html_url": { - "type": ["null", "string"] - }, - "followers_url": { - "type": ["null", "string"] - }, - "following_url": { - "type": ["null", "string"] - }, - "gists_url": { - "type": ["null", "string"] - }, - "starred_url": { - "type": ["null", "string"] - }, - "subscriptions_url": { - "type": ["null", "string"] - }, - "organizations_url": { - "type": ["null", "string"] - }, - "repos_url": { - "type": ["null", "string"] - }, - "events_url": { - "type": ["null", "string"] - }, - "received_events_url": { - "type": ["null", "string"] - }, - "type": { - "type": ["null", "string"] - }, - "site_admin": { - "type": ["null", "boolean"] - }, - "starred_at": { - "type": ["null", "string"] - } + "type": ["null", "object"], + "properties": { + "html_url": { + "type": ["null", "string"] + }, + "url": { + "type": ["null", "string"] + }, + "id": { + "type": ["null", "number"] + }, + "node_id": { + "type": ["null", "string"] + }, + "body": { + "type": ["null", "string"] + }, + "path": { + "type": ["null", "string"] + }, + "position": { + "type": ["null", "number"] + }, + "line": { + "type": ["null", "number"] + }, + "commit_id": { + "type": ["null", "string"] + }, + "user": { + "type": ["null", "object"], + "properties": { + "name": { + "type": ["null", "string"] + }, + "email": { + "type": ["null", "string"] + }, + "login": { + "type": ["null", "string"] + }, + "id": { + "type": ["null", "number"] + }, + "node_id": { + "type": ["null", "string"] + }, + "avatar_url": { + "type": ["null", "string"] + }, + "gravatar_id": { + "type": ["null", "string"] + }, + "url": { + "type": ["null", "string"] + }, + "html_url": { + "type": ["null", "string"] + }, + "followers_url": { + "type": ["null", "string"] + }, + "following_url": { + "type": ["null", "string"] + }, + "gists_url": { + "type": ["null", "string"] + }, + "starred_url": { + "type": ["null", "string"] + }, + "subscriptions_url": { + "type": ["null", "string"] + }, + "organizations_url": { + "type": ["null", "string"] + }, + "repos_url": { + "type": ["null", "string"] + }, + "events_url": { + "type": ["null", "string"] + }, + "received_events_url": { + "type": ["null", "string"] + }, + "type": { + "type": ["null", "string"] + }, + "site_admin": { + "type": ["null", "boolean"] + }, + "starred_at": { + "type": ["null", "string"] } - }, - "created_at": { - "type": ["null", "string"], - "format": "date-time" - }, - "updated_at": { - "type": ["null", "string"], - "format": "date-time" - }, - "author_association": { - "type": ["null", "string"] - }, - "reactions": { - "$ref": "shared/reactions.json#/" } + }, + "created_at": { + "type": ["null", "string"], + "format": "date-time" + }, + "updated_at": { + "type": ["null", "string"], + "format": "date-time" + }, + "author_association": { + "type": ["null", "string"] + }, + "reactions": { + "$ref": "shared/reactions.json#/" + }, + "_sdc_repository": { + "type": ["string"] } + } } From fb1b137ca1b7839d7cdc4fa07905587586ffd99a Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 25 Nov 2025 01:45:54 +0000 Subject: [PATCH 49/52] Add fail-fast check for archived repositories Add validation to detect archived repositories during discovery and sync phases. The extraction will fail with ArchivedRepositoryError if a repository is archived, unless 'extract_archived' is set to 'true' in the config. Changes: - Add ArchivedRepositoryError exception class - Add extract_archived config option (default: false) - Add check_repo_archived() method to verify repo status via GitHub API - Integrate archived check into verify_access_for_repo() - Handle archived repos efficiently in get_all_repos() for org/* wildcards --- config.sample.json | 3 ++- tap_github/client.py | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/config.sample.json b/config.sample.json index 61df3707..4a9d46ef 100644 --- a/config.sample.json +++ b/config.sample.json @@ -3,5 +3,6 @@ "repository": "singer-io/target-stitch", "start_date": "2021-01-01T00:00:00Z", "request_timeout": 300, - "base_url": "https://api.github.com" + "base_url": "https://api.github.com", + "extract_archived": "false" } diff --git a/tap_github/client.py b/tap_github/client.py index e182462f..18bf5e89 100644 --- a/tap_github/client.py +++ b/tap_github/client.py @@ -67,6 +67,10 @@ class RateLimitSleepExceeded(GithubException): class TooManyRequests(GithubException): pass +# Thrown when repository is archived and extract_archived is not enabled +class ArchivedRepositoryError(GithubException): + pass + ERROR_CODE_EXCEPTION_MAPPING = { 301: { @@ -200,6 +204,9 @@ def __init__(self, config): self.set_auth_in_session() self.not_accessible_repos = set() self.max_per_page = self.config.get('max_per_page', DEFAULT_MAX_PER_PAGE) + # Convert string 'true'/'false' to boolean, default to False + extract_archived_value = str(self.config.get('extract_archived', 'false')).lower() + self.extract_archived = extract_archived_value == 'true' def get_request_timeout(self): """ @@ -282,9 +289,31 @@ def verify_repo_access(self, url_for_repo, repo): message = "HTTP-error-code: 404, Error: Please check the repository name \'{}\' or you do not have sufficient permissions to access this repository.".format(repo) raise NotFoundException(message) from None + def check_repo_archived(self, repo): + """ + Check if a repository is archived and raise an error if extract_archived is not enabled. + + Args: + repo: Repository in 'org/repo' format + + Raises: + ArchivedRepositoryError: If repo is archived and extract_archived config is not true + """ + url = "{}/repos/{}".format(self.base_url, repo) + response = self.authed_get_single_page("checking repository archived status", url, should_skip_404=False) + repo_info = response.json() + + if repo_info.get('archived', False): + if not self.extract_archived: + message = "Repository '{}' is archived. To extract data from archived repositories, " \ + "set 'extract_archived' to 'true' in the config.".format(repo) + raise ArchivedRepositoryError(message) + LOGGER.warning("Repository '%s' is archived. Proceeding with extraction as 'extract_archived' is enabled.", repo) + def verify_access_for_repo(self): """ For all the repositories mentioned in the config, check the access for each repos. + Also checks if repositories are archived and fails if extract_archived is not enabled. """ repositories, org = self.extract_repos_from_config() # pylint: disable=unused-variable @@ -296,6 +325,9 @@ def verify_access_for_repo(self): # Verifying for Repo access self.verify_repo_access(url_for_repo, repo) + # Check if repository is archived + self.check_repo_archived(repo) + def extract_orgs_from_config(self): """ Extracts all organizations from the config @@ -383,6 +415,14 @@ def get_all_repos(self, organizations: list): repo ) + # Check if repository is archived (info already available in response) + if repo.get('archived', False): + if not self.extract_archived: + message = "Repository '{}' is archived. To extract data from archived repositories, " \ + "set 'extract_archived' to 'true' in the config.".format(repo_full_name) + raise ArchivedRepositoryError(message) + LOGGER.warning("Repository '%s' is archived. Proceeding with extraction as 'extract_archived' is enabled.", repo_full_name) + repos.append(repo_full_name) except NotFoundException: # Throwing user-friendly error message as it checks token access From 3dde1aac5554c46e4e621e5756e13a8fec14ba17 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaros=C5=82aw=20Cellary?= Date: Thu, 4 Dec 2025 08:44:29 +0100 Subject: [PATCH 50/52] bumped version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 2cbde148..37115bd3 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from setuptools import setup, find_packages setup(name='tap-github', - version='2.0.13', + version='2.0.14', description='Singer.io tap for extracting data from the GitHub API', author='Stitch', url='http://singer.io', From 8e069e9906cc20f389db6f9b9ff589e917400c8b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaros=C5=82aw=20Cellary?= Date: Thu, 18 Dec 2025 15:27:54 +0100 Subject: [PATCH 51/52] all issue milestones --- tap_github/streams.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tap_github/streams.py b/tap_github/streams.py index 627eb45e..c5bcb025 100644 --- a/tap_github/streams.py +++ b/tap_github/streams.py @@ -743,7 +743,7 @@ class IssueMilestones(IncrementalOrderedStream): replication_method = "INCREMENTAL" replication_keys = "updated_at" key_properties = ["id"] - path = "milestones?direction=desc&sort=updated_at" + path = "milestones?state=all&direction=desc&sort=updated_at" class Collaborators(FullTableStream): ''' From 759c1e565b1b6925737d63dd845f63b972f95301 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaros=C5=82aw=20Cellary?= Date: Thu, 18 Dec 2025 15:29:32 +0100 Subject: [PATCH 52/52] bumped version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 37115bd3..b93b7638 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from setuptools import setup, find_packages setup(name='tap-github', - version='2.0.14', + version='2.0.15', description='Singer.io tap for extracting data from the GitHub API', author='Stitch', url='http://singer.io',