diff --git a/README.md b/README.md index e8c4df01..7b8e8845 100644 --- a/README.md +++ b/README.md @@ -66,6 +66,10 @@ This tap: "base_url": "https://api.github.com" } ``` + +> Note: The max results per page is configurable with the parameter `max_per_page`, +> as default it will return 100 (that is the max of most of the endpoints) + 4. Run the tap in discovery mode to get properties.json file ```bash diff --git a/config.sample.json b/config.sample.json index 61df3707..4a9d46ef 100644 --- a/config.sample.json +++ b/config.sample.json @@ -3,5 +3,6 @@ "repository": "singer-io/target-stitch", "start_date": "2021-01-01T00:00:00Z", "request_timeout": 300, - "base_url": "https://api.github.com" + "base_url": "https://api.github.com", + "extract_archived": "false" } diff --git a/constraints.txt b/constraints.txt new file mode 100644 index 00000000..b8c0d08f --- /dev/null +++ b/constraints.txt @@ -0,0 +1,36 @@ +# +# This file is autogenerated by pip-compile with Python 3.12 +# by the following command: +# +# pip-compile --output-file=constraints.txt setup.py +# +--index-url https://nexus.a8c.com/repository/pypi/simple + +backoff==1.8.0 + # via + # singer-python + # tap-github (setup.py) +certifi==2025.8.3 + # via requests +charset-normalizer==3.4.3 + # via requests +ciso8601==2.3.3 + # via singer-python +idna==3.10 + # via requests +jsonschema==2.6.0 + # via singer-python +python-dateutil==2.9.0.post0 + # via singer-python +pytz==2018.4 + # via singer-python +requests==2.32.5 + # via tap-github (setup.py) +simplejson==3.11.1 + # via singer-python +singer-python==5.12.1 + # via tap-github (setup.py) +six==1.17.0 + # via python-dateutil +urllib3==2.5.0 + # via requests diff --git a/setup.py b/setup.py index b6c06fef..b93b7638 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from setuptools import setup, find_packages setup(name='tap-github', - version='2.0.0', + version='2.0.15', description='Singer.io tap for extracting data from the GitHub API', author='Stitch', url='http://singer.io', @@ -11,7 +11,7 @@ py_modules=['tap_github'], install_requires=[ 'singer-python==5.12.1', - 'requests==2.20.0', + 'requests==2.32.5', 'backoff==1.8.0' ], extras_require={ diff --git a/tap_github/client.py b/tap_github/client.py index 9913a8c2..18bf5e89 100644 --- a/tap_github/client.py +++ b/tap_github/client.py @@ -1,17 +1,27 @@ import time import requests +from requests.models import PreparedRequest import backoff from simplejson import JSONDecodeError import singer from singer import metrics +from math import ceil LOGGER = singer.get_logger() DEFAULT_SLEEP_SECONDS = 600 +DEFAULT_MIN_REMAIN_RATE_LIMIT = 0 +DEFAULT_MAX_PER_PAGE = 100 DEFAULT_DOMAIN = "https://api.github.com" # Set default timeout of 300 seconds REQUEST_TIMEOUT = 300 +# How many total seconds to retry when getting rate limit error from API. The limit resets every hour. +RATE_LIMIT_RETRY_MAX_TIME = 3600 + +PAGINATION_EXCEED_MSG = 'In order to keep the API fast for everyone, pagination is limited for this resource.' +RATE_LIMIT_EXCEED_MSG = 'API rate limit exceeded' + class GithubException(Exception): pass @@ -45,12 +55,22 @@ class MovedPermanentlyError(GithubException): class ConflictError(GithubException): pass +# Thrown when we receive 403 Rate Limit Exceeded from Github API class RateLimitExceeded(GithubException): pass +# Thrown when we're expected to sleep for longer than the max_sleep_seconds limit +class RateLimitSleepExceeded(GithubException): + pass + +# Thrown when 429 is received from Github API class TooManyRequests(GithubException): pass +# Thrown when repository is archived and extract_archived is not enabled +class ArchivedRepositoryError(GithubException): + pass + ERROR_CODE_EXCEPTION_MAPPING = { 301: { @@ -105,6 +125,13 @@ def raise_for_error(resp, source, stream, client, should_skip_404): except JSONDecodeError: response_json = {} + response_message = response_json.get('message', '') + + if error_code == 403 and RATE_LIMIT_EXCEED_MSG in response_message: + message = f"HTTP-error-code: 403, Error: {response_message}" + LOGGER.warning(message) + raise RateLimitExceeded() from None + if error_code == 404 and should_skip_404: # Add not accessible stream into list. client.not_accessible_repos.add(stream) @@ -116,6 +143,14 @@ def raise_for_error(resp, source, stream, client, should_skip_404): # Don't raise a NotFoundException return None + if error_code == 422 and PAGINATION_EXCEED_MSG in response_message: + message = f"HTTP-error-code: 422, Error: {response_message}. " \ + f"Please refer '{response_json.get('documentation_url')}' for more details." \ + "This is a known issue when the results exceed 40k and the last page is not full" \ + " (it will trim the results to get only the available by the API)." + LOGGER.warning(message) + return None + message = "HTTP-error-code: {}, Error: {}".format( error_code, ERROR_CODE_EXCEPTION_MAPPING.get(error_code, {}).get("message", "Unknown Error") if response_json == {} else response_json) @@ -130,19 +165,24 @@ def calculate_seconds(epoch): Calculate the seconds to sleep before making a new request. """ current = time.time() - return int(round((epoch - current), 0)) + return max(0, int(ceil(epoch - current))) -def rate_throttling(response, max_sleep_seconds): +def rate_throttling(response, max_sleep_seconds, min_remain_rate_limit): """ For rate limit errors, get the remaining time before retrying and calculate the time to sleep before making a new request. """ + if "Retry-After" in response.headers: + # handles the secondary rate limit + seconds_to_sleep = int(response.headers['Retry-After']) + LOGGER.info("Retry-After header found in response. Tap will retry the data collection after %s seconds.", seconds_to_sleep) + time.sleep(seconds_to_sleep) if 'X-RateLimit-Remaining' in response.headers: - if int(response.headers['X-RateLimit-Remaining']) == 0: - seconds_to_sleep = calculate_seconds(int(response.headers['X-RateLimit-Reset'])) + if int(response.headers['X-RateLimit-Remaining']) <= min_remain_rate_limit: + seconds_to_sleep = calculate_seconds(int(response.headers['X-RateLimit-Reset']) + 15) if seconds_to_sleep > max_sleep_seconds: message = "API rate limit exceeded, please try after {} seconds.".format(seconds_to_sleep) - raise RateLimitExceeded(message) from None + raise RateLimitSleepExceeded(message) from None LOGGER.info("API rate limit exceeded. Tap will retry the data collection after %s seconds.", seconds_to_sleep) time.sleep(seconds_to_sleep) @@ -160,8 +200,13 @@ def __init__(self, config): self.session = requests.Session() self.base_url = config['base_url'] if config.get('base_url') else DEFAULT_DOMAIN self.max_sleep_seconds = self.config.get('max_sleep_seconds', DEFAULT_SLEEP_SECONDS) + self.min_remain_rate_limit = self.config.get('min_remain_rate_limit', DEFAULT_MIN_REMAIN_RATE_LIMIT) self.set_auth_in_session() self.not_accessible_repos = set() + self.max_per_page = self.config.get('max_per_page', DEFAULT_MAX_PER_PAGE) + # Convert string 'true'/'false' to boolean, default to False + extract_archived_value = str(self.config.get('extract_archived', 'false')).lower() + self.extract_archived = extract_archived_value == 'true' def get_request_timeout(self): """ @@ -187,52 +232,88 @@ def set_auth_in_session(self): # pylint: disable=dangerous-default-value # During 'Timeout' error there is also possibility of 'ConnectionError', # hence added backoff for 'ConnectionError' too. - @backoff.on_exception(backoff.expo, (requests.Timeout, requests.ConnectionError, Server5xxError, TooManyRequests), max_tries=5, factor=2) - def authed_get(self, source, url, headers={}, stream="", should_skip_404 = True): + @backoff.on_exception(backoff.expo, (requests.Timeout, requests.ConnectionError, Server5xxError, TooManyRequests), + max_tries=5, factor=2) + @backoff.on_exception(backoff.expo, (BadCredentialsException, ), max_tries=3, factor=2) + @backoff.on_exception(backoff.constant, (RateLimitExceeded, ), interval=60, jitter=None, max_time=RATE_LIMIT_RETRY_MAX_TIME) + def authed_get_single_page(self, source, url, headers={}, stream="", should_skip_404 = True): """ Call rest API and return the response in case of status code 200. """ - with metrics.http_request_timer(source) as timer: + with metrics.http_request_timer(url) as timer: self.session.headers.update(headers) resp = self.session.request(method='get', url=url, timeout=self.get_request_timeout()) if resp.status_code != 200: raise_for_error(resp, source, stream, self, should_skip_404) timer.tags[metrics.Tag.http_status_code] = resp.status_code - rate_throttling(resp, self.max_sleep_seconds) - if resp.status_code == 404: + rate_throttling(resp, self.max_sleep_seconds, self.min_remain_rate_limit) + if resp.status_code == 404 or resp.status_code == 422: # Return an empty response body since we're not raising a NotFoundException - resp._content = b'{}' # pylint: disable=protected-access + resp._content = b'{}' # pylint: disable=protected-access return resp def authed_get_all_pages(self, source, url, headers={}, stream="", should_skip_404 = True): """ Fetch all pages of records and return them. """ - while True: - r = self.authed_get(source, url, headers, stream, should_skip_404) - yield r + next_url = self.prepare_url(url) + while next_url: + response = self.authed_get_single_page(source, next_url, headers, stream, should_skip_404) + yield response - # Fetch the next page if next found in the response. - if 'next' in r.links: - url = r.links['next']['url'] - else: - # Break the loop if all pages are fetched. - break + next_url = response.links.get('next', {}).get('url', None) + + def authed_get(self, source, url, headers={}, stream="", should_skip_404=True, single_page=False): + if single_page: + yield self.authed_get_single_page(source, url, headers, stream, should_skip_404) + else: + yield from self.authed_get_all_pages(source, url, headers, stream, should_skip_404) + + def prepare_url(self, url): + """ + Prepare the URL with some additional parameters + """ + prepared_request = PreparedRequest() + # Including max per page param + prepared_request.prepare_url(url, {'per_page': self.max_per_page}) + return prepared_request.url def verify_repo_access(self, url_for_repo, repo): """ Call rest API to verify that the user has sufficient permissions to access this repository. """ try: - self.authed_get("verifying repository access", url_for_repo) + self.authed_get_single_page("verifying repository access", url_for_repo, should_skip_404=False) except NotFoundException: # Throwing user-friendly error message as it checks token access message = "HTTP-error-code: 404, Error: Please check the repository name \'{}\' or you do not have sufficient permissions to access this repository.".format(repo) raise NotFoundException(message) from None + def check_repo_archived(self, repo): + """ + Check if a repository is archived and raise an error if extract_archived is not enabled. + + Args: + repo: Repository in 'org/repo' format + + Raises: + ArchivedRepositoryError: If repo is archived and extract_archived config is not true + """ + url = "{}/repos/{}".format(self.base_url, repo) + response = self.authed_get_single_page("checking repository archived status", url, should_skip_404=False) + repo_info = response.json() + + if repo_info.get('archived', False): + if not self.extract_archived: + message = "Repository '{}' is archived. To extract data from archived repositories, " \ + "set 'extract_archived' to 'true' in the config.".format(repo) + raise ArchivedRepositoryError(message) + LOGGER.warning("Repository '%s' is archived. Proceeding with extraction as 'extract_archived' is enabled.", repo) + def verify_access_for_repo(self): """ For all the repositories mentioned in the config, check the access for each repos. + Also checks if repositories are archived and fails if extract_archived is not enabled. """ repositories, org = self.extract_repos_from_config() # pylint: disable=unused-variable @@ -244,6 +325,9 @@ def verify_access_for_repo(self): # Verifying for Repo access self.verify_repo_access(url_for_repo, repo) + # Check if repository is archived + self.check_repo_archived(repo) + def extract_orgs_from_config(self): """ Extracts all organizations from the config @@ -331,6 +415,14 @@ def get_all_repos(self, organizations: list): repo ) + # Check if repository is archived (info already available in response) + if repo.get('archived', False): + if not self.extract_archived: + message = "Repository '{}' is archived. To extract data from archived repositories, " \ + "set 'extract_archived' to 'true' in the config.".format(repo_full_name) + raise ArchivedRepositoryError(message) + LOGGER.warning("Repository '%s' is archived. Proceeding with extraction as 'extract_archived' is enabled.", repo_full_name) + repos.append(repo_full_name) except NotFoundException: # Throwing user-friendly error message as it checks token access diff --git a/tap_github/schemas/commit_comments.json b/tap_github/schemas/commit_comments.json index 408448dc..3c8679ef 100644 --- a/tap_github/schemas/commit_comments.json +++ b/tap_github/schemas/commit_comments.json @@ -1,114 +1,117 @@ { - "type": ["null", "object"], - "properties": { - "html_url": { - "type": ["null", "string"] - }, - "url": { - "type": ["null", "string"] - }, - "id": { - "type": ["null", "number"] - }, - "node_id": { - "type": ["null", "string"] - }, - "body": { - "type": ["null", "string"] - }, - "path": { - "type": ["null", "string"] - }, - "position": { - "type": ["null", "number"] - }, - "line": { - "type": ["null", "number"] - }, - "commit_id": { - "type": ["null", "string"] - }, - "user": { - "type": ["null", "object"], - "properties": { - "name": { - "type": ["null", "string"] - }, - "email": { - "type": ["null", "string"] - }, - "login": { - "type": ["null", "string"] - }, - "id": { - "type": ["null", "number"] - }, - "node_id": { - "type": ["null", "string"] - }, - "avatar_url": { - "type": ["null", "string"] - }, - "gravatar_id": { - "type": ["null", "string"] - }, - "url": { - "type": ["null", "string"] - }, - "html_url": { - "type": ["null", "string"] - }, - "followers_url": { - "type": ["null", "string"] - }, - "following_url": { - "type": ["null", "string"] - }, - "gists_url": { - "type": ["null", "string"] - }, - "starred_url": { - "type": ["null", "string"] - }, - "subscriptions_url": { - "type": ["null", "string"] - }, - "organizations_url": { - "type": ["null", "string"] - }, - "repos_url": { - "type": ["null", "string"] - }, - "events_url": { - "type": ["null", "string"] - }, - "received_events_url": { - "type": ["null", "string"] - }, - "type": { - "type": ["null", "string"] - }, - "site_admin": { - "type": ["null", "boolean"] - }, - "starred_at": { - "type": ["null", "string"] - } + "type": ["null", "object"], + "properties": { + "html_url": { + "type": ["null", "string"] + }, + "url": { + "type": ["null", "string"] + }, + "id": { + "type": ["null", "number"] + }, + "node_id": { + "type": ["null", "string"] + }, + "body": { + "type": ["null", "string"] + }, + "path": { + "type": ["null", "string"] + }, + "position": { + "type": ["null", "number"] + }, + "line": { + "type": ["null", "number"] + }, + "commit_id": { + "type": ["null", "string"] + }, + "user": { + "type": ["null", "object"], + "properties": { + "name": { + "type": ["null", "string"] + }, + "email": { + "type": ["null", "string"] + }, + "login": { + "type": ["null", "string"] + }, + "id": { + "type": ["null", "number"] + }, + "node_id": { + "type": ["null", "string"] + }, + "avatar_url": { + "type": ["null", "string"] + }, + "gravatar_id": { + "type": ["null", "string"] + }, + "url": { + "type": ["null", "string"] + }, + "html_url": { + "type": ["null", "string"] + }, + "followers_url": { + "type": ["null", "string"] + }, + "following_url": { + "type": ["null", "string"] + }, + "gists_url": { + "type": ["null", "string"] + }, + "starred_url": { + "type": ["null", "string"] + }, + "subscriptions_url": { + "type": ["null", "string"] + }, + "organizations_url": { + "type": ["null", "string"] + }, + "repos_url": { + "type": ["null", "string"] + }, + "events_url": { + "type": ["null", "string"] + }, + "received_events_url": { + "type": ["null", "string"] + }, + "type": { + "type": ["null", "string"] + }, + "site_admin": { + "type": ["null", "boolean"] + }, + "starred_at": { + "type": ["null", "string"] } - }, - "created_at": { - "type": ["null", "string"], - "format": "date-time" - }, - "updated_at": { - "type": ["null", "string"], - "format": "date-time" - }, - "author_association": { - "type": ["null", "string"] - }, - "reactions": { - "$ref": "shared/reactions.json#/" } + }, + "created_at": { + "type": ["null", "string"], + "format": "date-time" + }, + "updated_at": { + "type": ["null", "string"], + "format": "date-time" + }, + "author_association": { + "type": ["null", "string"] + }, + "reactions": { + "$ref": "shared/reactions.json#/" + }, + "_sdc_repository": { + "type": ["string"] } + } } diff --git a/tap_github/schemas/open_issues.json b/tap_github/schemas/open_issues.json new file mode 100644 index 00000000..93365708 --- /dev/null +++ b/tap_github/schemas/open_issues.json @@ -0,0 +1,276 @@ +{ + "properties": { + "state": { + "type": ["null", "string"] + }, + "state_reason": { + "type": ["null", "string"] + }, + "url": { + "type": ["null", "string"] + }, + "labels": { + "type": ["null", "array"], + "items": { + "type": ["null", "object"], + "properties": { + "id": { + "type": ["null", "integer"] + }, + "node_id": { + "type": ["null", "string"] + }, + "url": { + "type": ["null", "string"] + }, + "name": { + "type": ["null", "string"] + }, + "description": { + "type": ["null", "string"] + }, + "color": { + "type": ["null", "string"] + }, + "default": { + "type": ["null", "boolean"] + } + } + } + }, + "repository_url": { + "type": ["null", "string"] + }, + "number": { + "type": ["null", "integer"] + }, + "closed_at": { + "type": ["null", "string"], + "format": "date-time" + }, + "labels_url": { + "type": ["null", "string"] + }, + "title": { + "type": ["null", "string"] + }, + "assignee": { + "$ref": "shared/user.json#/" + }, + "assignees": { + "type": ["null", "array"], + "items": { + "$ref": "shared/user.json#/" + } + }, + "milestone": { + "type": ["null", "object"], + "properties": { + "url": { + "type": ["null", "string"] + }, + "html_url": { + "type": ["null", "string"] + }, + "labels_url": { + "type": ["null", "string"] + }, + "id": { + "type": ["null", "integer"] + }, + "node_id": { + "type": ["null", "string"] + }, + "number": { + "type": ["null", "integer"] + }, + "state": { + "type": ["null", "string"] + }, + "title": { + "type": ["null", "string"] + }, + "description": { + "type": ["null", "string"] + }, + "creator": { + "$ref": "shared/user.json#/" + }, + "open_issues": { + "type": ["null", "integer"] + }, + "closed_issues": { + "type": ["null", "integer"] + }, + "created_at": { + "type": ["null", "string"], + "format": "date-time" + }, + "updated_at": { + "type": ["null", "string"], + "format": "date-time" + }, + "closed_at": { + "type": ["null", "string"], + "format": "date-time" + }, + "due_on": { + "type": ["null", "string"], + "format": "date-time" + } + } + }, + "reactions": { + "$ref": "shared/reactions.json#/" + }, + "active_lock_reason": { + "type": ["null", "string"] + }, + "body_html": { + "type": ["null", "string"] + }, + "performed_via_github_app": { + "$ref": "shared/performed_via_github_app.json#/" + }, + "timeline_url": { + "type": ["null", "string"] + }, + "closed_by": { + "type": ["null", "object"], + "properties": { + "name": { + "type": ["null", "string"] + }, + "email": { + "type": ["null", "string"] + }, + "login": { + "type": ["null", "string"] + }, + "id": { + "type": ["null", "integer"] + }, + "node_id": { + "type": ["null", "string"] + }, + "avatar_url": { + "type": ["null", "string"] + }, + "gravatar_id": { + "type": ["null", "string"] + }, + "url": { + "type": ["null", "string"] + }, + "html_url": { + "type": ["null", "string"] + }, + "followers_url": { + "type": ["null", "string"] + }, + "following_url": { + "type": ["null", "string"] + }, + "gists_url": { + "type": ["null", "string"] + }, + "starred_url": { + "type": ["null", "string"] + }, + "subscriptions_url": { + "type": ["null", "string"] + }, + "organizations_url": { + "type": ["null", "string"] + }, + "repos_url": { + "type": ["null", "string"] + }, + "events_url": { + "type": ["null", "string"] + }, + "received_events_url": { + "type": ["null", "string"] + }, + "type": { + "type": ["null", "string"] + }, + "site_admin": { + "type": ["null", "boolean"] + }, + "starred_at": { + "type": ["null", "string"] + } + } + }, + "updated_at": { + "type": ["null", "string"], + "format": "date-time" + }, + "html_url": { + "type": ["null", "string"] + }, + "author_association": { + "type": ["null", "string"] + }, + "locked": { + "type": ["null", "boolean"] + }, + "events_url": { + "type": ["null", "string"] + }, + "pull_request": { + "properties": { + "diff_url": { + "type": ["null", "string"] + }, + "html_url": { + "type": ["null", "string"] + }, + "patch_url": { + "type": ["null", "string"] + }, + "url": { + "type": ["null", "string"] + }, + "merged_at": { + "type": ["null", "string"], + "format": "date-time" + } + }, + "type": ["null", "object"] + }, + "node_id": { + "type": ["null", "string"] + }, + "body": { + "type": ["null", "string"] + }, + "comments": { + "type": ["null", "integer"] + }, + "created_at": { + "type": ["null", "string"], + "format": "date-time" + }, + "_sdc_repository": { + "type": ["string"] + }, + "user": { + "$ref": "shared/user.json#/" + }, + "id": { + "type": ["null", "integer"] + }, + "comments_url": { + "type": ["null", "string"] + }, + "body_text": { + "type": ["null", "string"] + }, + "draft": { + "type": ["null", "boolean"] + } + }, + "type": ["null", "object"] +} diff --git a/tap_github/schemas/releases.json b/tap_github/schemas/releases.json index b903a026..fc97ced2 100644 --- a/tap_github/schemas/releases.json +++ b/tap_github/schemas/releases.json @@ -186,8 +186,7 @@ "format": "date-time" }, "discussion_url": { - "type": ["null", "string"], - "format": "date-time" + "type": ["null", "string"] } } } \ No newline at end of file diff --git a/tap_github/schemas/repo_forked_compares.json b/tap_github/schemas/repo_forked_compares.json new file mode 100644 index 00000000..70ad5d02 --- /dev/null +++ b/tap_github/schemas/repo_forked_compares.json @@ -0,0 +1,77 @@ +{ + "type": ["null", "object"], + "properties": { + "_sdc_repository": { + "type": ["null", "string"] + }, + "name": { + "type": ["null", "string"] + }, + "full_name": { + "type": ["null", "string"] + }, + "owner_login": { + "type": ["null", "string"] + }, + "default_branch": { + "type": ["null", "string"] + }, + "fork_name": { + "type": ["null", "string"] + }, + "fork_owner_login": { + "type": ["null", "string"] + }, + "fork_default_branch": { + "type": ["null", "string"] + }, + "url": { + "type": ["null", "string"] + }, + "html_url": { + "type": ["null", "string"] + }, + "permalink_url": { + "type": ["null", "string"] + }, + "diff_url": { + "type": ["null", "string"] + }, + "patch_url": { + "type": ["null", "string"] + }, + "base_commit": { + "type": ["null", "object"] + }, + "merge_base_commit": { + "type": ["null", "object"] + }, + "status": { + "type": ["null", "string"] + }, + "ahead_by": { + "type": ["null", "number"] + }, + "behind_by": { + "type": ["null", "number"] + }, + "total_commits": { + "type": ["null", "number"] + }, + "commits": { + "type": ["null", "array"], + "items": { + "type": ["null", "object"] + } + }, + "files": { + "type": ["null", "array"], + "items": { + "type": ["null", "object"] + } + }, + "error_message": { + "type": ["null", "string"] + } + } +} diff --git a/tap_github/schemas/repo_forked_parents.json b/tap_github/schemas/repo_forked_parents.json new file mode 100644 index 00000000..31ff6129 --- /dev/null +++ b/tap_github/schemas/repo_forked_parents.json @@ -0,0 +1,260 @@ +{ + "type": ["null", "object"], + "properties": { + "_sdc_repository": { + "type": ["null", "string"] + }, + "id": { + "type": ["null", "number"] + }, + "node_id": { + "type": ["null", "string"] + }, + "url": { + "type": ["null", "string"] + }, + "html_url": { + "type": ["null", "string"] + }, + "name": { + "type": ["null", "string"] + }, + "full_name": { + "type": ["null", "string"] + }, + "private": { + "type": ["null", "boolean"] + }, + "owner": { + "$ref": "shared/user.json#/" + }, + "description": { + "type": ["null", "string"] + }, + "fork": { + "type": ["null", "boolean"] + }, + "fork_name": { + "type": ["null", "string"] + }, + "fork_owner_login": { + "type": ["null", "string"] + }, + "fork_default_branch": { + "type": ["null", "string"] + }, + "keys_url": { + "type": ["null", "string"] + }, + "collaborators_url": { + "type": ["null", "string"] + }, + "teams_url": { + "type": ["null", "string"] + }, + "hooks_url": { + "type": ["null", "string"] + }, + "issue_events_url": { + "type": ["null", "string"] + }, + "events_url": { + "type": ["null", "string"] + }, + "assignees_url": { + "type": ["null", "string"] + }, + "branches_url": { + "type": ["null", "string"] + }, + "tags_url": { + "type": ["null", "string"] + }, + "blobs_url": { + "type": ["null", "string"] + }, + "git_tags_url": { + "type": ["null", "string"] + }, + "git_refs_url": { + "type": ["null", "string"] + }, + "trees_url": { + "type": ["null", "string"] + }, + "statuses_url": { + "type": ["null", "string"] + }, + "languages_url": { + "type": ["null", "string"] + }, + "stargazers_url": { + "type": ["null", "string"] + }, + "contributors_url": { + "type": ["null", "string"] + }, + "subscribers_url": { + "type": ["null", "string"] + }, + "subscription_url": { + "type": ["null", "string"] + }, + "commits_url": { + "type": ["null", "string"] + }, + "git_commits_url": { + "type": ["null", "string"] + }, + "comments_url": { + "type": ["null", "string"] + }, + "issue_comment_url": { + "type": ["null", "string"] + }, + "contents_url": { + "type": ["null", "string"] + }, + "compare_url": { + "type": ["null", "string"] + }, + "merges_url": { + "type": ["null", "string"] + }, + "archive_url": { + "type": ["null", "string"] + }, + "downloads_url": { + "type": ["null", "string"] + }, + "issues_url": { + "type": ["null", "string"] + }, + "pulls_url": { + "type": ["null", "string"] + }, + "milestones_url": { + "type": ["null", "string"] + }, + "notifications_url": { + "type": ["null", "string"] + }, + "labels_url": { + "type": ["null", "string"] + }, + "releases_url": { + "type": ["null", "string"] + }, + "deployments_url": { + "type": ["null", "string"] + }, + "created_at": { + "type": ["null", "string"] + }, + "updated_at": { + "type": ["null", "string"] + }, + "pushed_at": { + "type": ["null", "string"] + }, + "git_url": { + "type": ["null", "string"] + }, + "ssh_url": { + "type": ["null", "string"] + }, + "clone_url": { + "type": ["null", "string"] + }, + "svn_url": { + "type": ["null", "string"] + }, + "homepage": { + "type": ["null", "string"] + }, + "size": { + "type": ["null", "number"] + }, + "stargazers_count": { + "type": ["null", "number"] + }, + "watchers_count": { + "type": ["null", "number"] + }, + "language": { + "type": ["null", "string"] + }, + "has_issues": { + "type": ["null", "boolean"] + }, + "has_projects": { + "type": ["null", "boolean"] + }, + "has_downloads": { + "type": ["null", "boolean"] + }, + "has_wiki": { + "type": ["null", "boolean"] + }, + "has_pages": { + "type": ["null", "boolean"] + }, + "has_discussions": { + "type": ["null", "boolean"] + }, + "forks_count": { + "type": ["null", "number"] + }, + "mirror_url": { + "type": ["null", "string"] + }, + "archived": { + "type": ["null", "boolean"] + }, + "disabled": { + "type": ["null", "boolean"] + }, + "open_issues_count": { + "type": ["null", "number"] + }, + "license": { + "type": ["null", "object"] + }, + "allow_forking": { + "type": ["null", "boolean"] + }, + "is_template": { + "type": ["null", "boolean"] + }, + "web_commit_signoff_required": { + "type": ["null", "boolean"] + }, + "topics": { + "type": ["null", "array"], + "items": { + "type": ["string"] + } + }, + "visibility": { + "type": ["null", "string"] + }, + "forks": { + "type": ["null", "number"] + }, + "open_issues": { + "type": ["null", "number"] + }, + "watchers": { + "type": ["null", "number"] + }, + "default_branch": { + "type": ["null", "string"] + }, + "permissions": { + "$ref": "shared/pull_permissions.json#/" + }, + "parent": { + "$ref": "shared/repos.json#/" + } + } +} diff --git a/tap_github/schemas/repos.json b/tap_github/schemas/repos.json new file mode 100644 index 00000000..7954ed2e --- /dev/null +++ b/tap_github/schemas/repos.json @@ -0,0 +1,248 @@ +{ + "type": ["null", "object"], + "properties": { + "_sdc_repository": { + "type": ["null", "string"] + }, + "id": { + "type": ["null", "number"] + }, + "node_id": { + "type": ["null", "string"] + }, + "url": { + "type": ["null", "string"] + }, + "html_url": { + "type": ["null", "string"] + }, + "name": { + "type": ["null", "string"] + }, + "full_name": { + "type": ["null", "string"] + }, + "private": { + "type": ["null", "boolean"] + }, + "owner": { + "$ref": "shared/user.json#/" + }, + "description": { + "type": ["null", "string"] + }, + "fork": { + "type": ["null", "boolean"] + }, + "keys_url": { + "type": ["null", "string"] + }, + "collaborators_url": { + "type": ["null", "string"] + }, + "teams_url": { + "type": ["null", "string"] + }, + "hooks_url": { + "type": ["null", "string"] + }, + "issue_events_url": { + "type": ["null", "string"] + }, + "events_url": { + "type": ["null", "string"] + }, + "assignees_url": { + "type": ["null", "string"] + }, + "branches_url": { + "type": ["null", "string"] + }, + "tags_url": { + "type": ["null", "string"] + }, + "blobs_url": { + "type": ["null", "string"] + }, + "git_tags_url": { + "type": ["null", "string"] + }, + "git_refs_url": { + "type": ["null", "string"] + }, + "trees_url": { + "type": ["null", "string"] + }, + "statuses_url": { + "type": ["null", "string"] + }, + "languages_url": { + "type": ["null", "string"] + }, + "stargazers_url": { + "type": ["null", "string"] + }, + "contributors_url": { + "type": ["null", "string"] + }, + "subscribers_url": { + "type": ["null", "string"] + }, + "subscription_url": { + "type": ["null", "string"] + }, + "commits_url": { + "type": ["null", "string"] + }, + "git_commits_url": { + "type": ["null", "string"] + }, + "comments_url": { + "type": ["null", "string"] + }, + "issue_comment_url": { + "type": ["null", "string"] + }, + "contents_url": { + "type": ["null", "string"] + }, + "compare_url": { + "type": ["null", "string"] + }, + "merges_url": { + "type": ["null", "string"] + }, + "archive_url": { + "type": ["null", "string"] + }, + "downloads_url": { + "type": ["null", "string"] + }, + "issues_url": { + "type": ["null", "string"] + }, + "pulls_url": { + "type": ["null", "string"] + }, + "milestones_url": { + "type": ["null", "string"] + }, + "notifications_url": { + "type": ["null", "string"] + }, + "labels_url": { + "type": ["null", "string"] + }, + "releases_url": { + "type": ["null", "string"] + }, + "deployments_url": { + "type": ["null", "string"] + }, + "created_at": { + "type": ["null", "string"] + }, + "updated_at": { + "type": ["null", "string"] + }, + "pushed_at": { + "type": ["null", "string"] + }, + "git_url": { + "type": ["null", "string"] + }, + "ssh_url": { + "type": ["null", "string"] + }, + "clone_url": { + "type": ["null", "string"] + }, + "svn_url": { + "type": ["null", "string"] + }, + "homepage": { + "type": ["null", "string"] + }, + "size": { + "type": ["null", "number"] + }, + "stargazers_count": { + "type": ["null", "number"] + }, + "watchers_count": { + "type": ["null", "number"] + }, + "language": { + "type": ["null", "string"] + }, + "has_issues": { + "type": ["null", "boolean"] + }, + "has_projects": { + "type": ["null", "boolean"] + }, + "has_downloads": { + "type": ["null", "boolean"] + }, + "has_wiki": { + "type": ["null", "boolean"] + }, + "has_pages": { + "type": ["null", "boolean"] + }, + "has_discussions": { + "type": ["null", "boolean"] + }, + "forks_count": { + "type": ["null", "number"] + }, + "mirror_url": { + "type": ["null", "string"] + }, + "archived": { + "type": ["null", "boolean"] + }, + "disabled": { + "type": ["null", "boolean"] + }, + "open_issues_count": { + "type": ["null", "number"] + }, + "license": { + "type": ["null", "object"] + }, + "allow_forking": { + "type": ["null", "boolean"] + }, + "is_template": { + "type": ["null", "boolean"] + }, + "web_commit_signoff_required": { + "type": ["null", "boolean"] + }, + "topics": { + "type": ["null", "array"], + "items": { + "type": ["string"] + } + }, + "visibility": { + "type": ["null", "string"] + }, + "forks": { + "type": ["null", "number"] + }, + "open_issues": { + "type": ["null", "number"] + }, + "watchers": { + "type": ["null", "number"] + }, + "default_branch": { + "type": ["null", "string"] + }, + "permissions": { + "$ref": "shared/pull_permissions.json#/" + } + } +} diff --git a/tap_github/schemas/shared/repos.json b/tap_github/schemas/shared/repos.json new file mode 100644 index 00000000..7954ed2e --- /dev/null +++ b/tap_github/schemas/shared/repos.json @@ -0,0 +1,248 @@ +{ + "type": ["null", "object"], + "properties": { + "_sdc_repository": { + "type": ["null", "string"] + }, + "id": { + "type": ["null", "number"] + }, + "node_id": { + "type": ["null", "string"] + }, + "url": { + "type": ["null", "string"] + }, + "html_url": { + "type": ["null", "string"] + }, + "name": { + "type": ["null", "string"] + }, + "full_name": { + "type": ["null", "string"] + }, + "private": { + "type": ["null", "boolean"] + }, + "owner": { + "$ref": "shared/user.json#/" + }, + "description": { + "type": ["null", "string"] + }, + "fork": { + "type": ["null", "boolean"] + }, + "keys_url": { + "type": ["null", "string"] + }, + "collaborators_url": { + "type": ["null", "string"] + }, + "teams_url": { + "type": ["null", "string"] + }, + "hooks_url": { + "type": ["null", "string"] + }, + "issue_events_url": { + "type": ["null", "string"] + }, + "events_url": { + "type": ["null", "string"] + }, + "assignees_url": { + "type": ["null", "string"] + }, + "branches_url": { + "type": ["null", "string"] + }, + "tags_url": { + "type": ["null", "string"] + }, + "blobs_url": { + "type": ["null", "string"] + }, + "git_tags_url": { + "type": ["null", "string"] + }, + "git_refs_url": { + "type": ["null", "string"] + }, + "trees_url": { + "type": ["null", "string"] + }, + "statuses_url": { + "type": ["null", "string"] + }, + "languages_url": { + "type": ["null", "string"] + }, + "stargazers_url": { + "type": ["null", "string"] + }, + "contributors_url": { + "type": ["null", "string"] + }, + "subscribers_url": { + "type": ["null", "string"] + }, + "subscription_url": { + "type": ["null", "string"] + }, + "commits_url": { + "type": ["null", "string"] + }, + "git_commits_url": { + "type": ["null", "string"] + }, + "comments_url": { + "type": ["null", "string"] + }, + "issue_comment_url": { + "type": ["null", "string"] + }, + "contents_url": { + "type": ["null", "string"] + }, + "compare_url": { + "type": ["null", "string"] + }, + "merges_url": { + "type": ["null", "string"] + }, + "archive_url": { + "type": ["null", "string"] + }, + "downloads_url": { + "type": ["null", "string"] + }, + "issues_url": { + "type": ["null", "string"] + }, + "pulls_url": { + "type": ["null", "string"] + }, + "milestones_url": { + "type": ["null", "string"] + }, + "notifications_url": { + "type": ["null", "string"] + }, + "labels_url": { + "type": ["null", "string"] + }, + "releases_url": { + "type": ["null", "string"] + }, + "deployments_url": { + "type": ["null", "string"] + }, + "created_at": { + "type": ["null", "string"] + }, + "updated_at": { + "type": ["null", "string"] + }, + "pushed_at": { + "type": ["null", "string"] + }, + "git_url": { + "type": ["null", "string"] + }, + "ssh_url": { + "type": ["null", "string"] + }, + "clone_url": { + "type": ["null", "string"] + }, + "svn_url": { + "type": ["null", "string"] + }, + "homepage": { + "type": ["null", "string"] + }, + "size": { + "type": ["null", "number"] + }, + "stargazers_count": { + "type": ["null", "number"] + }, + "watchers_count": { + "type": ["null", "number"] + }, + "language": { + "type": ["null", "string"] + }, + "has_issues": { + "type": ["null", "boolean"] + }, + "has_projects": { + "type": ["null", "boolean"] + }, + "has_downloads": { + "type": ["null", "boolean"] + }, + "has_wiki": { + "type": ["null", "boolean"] + }, + "has_pages": { + "type": ["null", "boolean"] + }, + "has_discussions": { + "type": ["null", "boolean"] + }, + "forks_count": { + "type": ["null", "number"] + }, + "mirror_url": { + "type": ["null", "string"] + }, + "archived": { + "type": ["null", "boolean"] + }, + "disabled": { + "type": ["null", "boolean"] + }, + "open_issues_count": { + "type": ["null", "number"] + }, + "license": { + "type": ["null", "object"] + }, + "allow_forking": { + "type": ["null", "boolean"] + }, + "is_template": { + "type": ["null", "boolean"] + }, + "web_commit_signoff_required": { + "type": ["null", "boolean"] + }, + "topics": { + "type": ["null", "array"], + "items": { + "type": ["string"] + } + }, + "visibility": { + "type": ["null", "string"] + }, + "forks": { + "type": ["null", "number"] + }, + "open_issues": { + "type": ["null", "number"] + }, + "watchers": { + "type": ["null", "number"] + }, + "default_branch": { + "type": ["null", "string"] + }, + "permissions": { + "$ref": "shared/pull_permissions.json#/" + } + } +} diff --git a/tap_github/streams.py b/tap_github/streams.py index 278dd05a..c5bcb025 100644 --- a/tap_github/streams.py +++ b/tap_github/streams.py @@ -2,6 +2,8 @@ import singer from singer import (metrics, bookmarks, metadata) +from tap_github.client import UnprocessableError + LOGGER = singer.get_logger() DATE_FORMAT = '%Y-%m-%dT%H:%M:%SZ' @@ -68,6 +70,7 @@ class Stream: use_repository = False headers = {'Accept': '*/*'} parent = None + single_page = False def build_url(self, base_url, repo_path, bookmark): """ @@ -153,10 +156,11 @@ def get_child_records(self, stream_catalog = get_schema(catalog, child_object.tap_stream_id) with metrics.record_counter(child_object.tap_stream_id) as counter: - for response in client.authed_get_all_pages( + for response in client.authed_get( child_object.tap_stream_id, child_full_url, - stream = child_object.tap_stream_id + stream = child_object.tap_stream_id, + single_page = child_object.single_page ): records = response.json() extraction_time = singer.utils.now() @@ -196,6 +200,17 @@ def get_child_records(self, singer.write_record(child_object.tap_stream_id, rec, time_extracted=extraction_time) + # Loop through each child and nested child in the parent and fetch all the child records. + for nested_child in child_object.children: + if nested_child in stream_to_sync: + # Collect id of child record to pass in the API of its sub-child. + child_id = tuple(records.get(key) for key in STREAMS[nested_child]().id_keys) + # Here, grand_parent_id is the id of 1st level parent(main parent) which is required to + # pass in the API of the current child's sub-child. + child_object.get_child_records(client, catalog, nested_child, child_id, repo_path, state, start_date, + bookmark_dttm, stream_to_sync, selected_stream_ids, grand_parent_id, + records) + # pylint: disable=unnecessary-pass def add_fields_at_1st_level(self, record, parent_record = None): """ @@ -223,11 +238,12 @@ def sync_endpoint(self, stream_catalog = get_schema(catalog, self.tap_stream_id) with metrics.record_counter(self.tap_stream_id) as counter: - for response in client.authed_get_all_pages( + for response in client.authed_get( self.tap_stream_id, full_url, self.headers, - stream = self.tap_stream_id + stream = self.tap_stream_id, + single_page = self.single_page ): records = response.json() extraction_time = singer.utils.now() @@ -294,11 +310,12 @@ def sync_endpoint(self, stream_catalog = get_schema(catalog, self.tap_stream_id) with metrics.record_counter(self.tap_stream_id) as counter: - for response in client.authed_get_all_pages( + for response in client.authed_get( self.tap_stream_id, full_url, self.headers, - stream = self.tap_stream_id + stream = self.tap_stream_id, + single_page = self.single_page ): records = response.json() extraction_time = singer.utils.now() @@ -381,10 +398,11 @@ def sync_endpoint(self, parent_bookmark_value = bookmark_value record_counter = 0 with metrics.record_counter(self.tap_stream_id) as counter: - for response in client.authed_get_all_pages( + for response in client.authed_get( self.tap_stream_id, full_url, - stream = self.tap_stream_id + stream = self.tap_stream_id, + single_page = self.single_page ): records = response.json() extraction_time = singer.utils.now() @@ -648,6 +666,18 @@ class Issues(IncrementalOrderedStream): filter_param = True path = "issues?state=all&sort=updated&direction=desc" + +class OpenIssues(FullTableStream): + ''' + https://docs.github.com/en/rest/issues/issues#list-repository-issues + ''' + tap_stream_id = "open_issues" + replication_method = "FULL_TABLE" + key_properties = ["id"] + filter_param = False + path = "issues?state=open" + + class Assignees(FullTableStream): ''' https://docs.github.com/en/rest/issues/assignees#list-assignees @@ -713,7 +743,7 @@ class IssueMilestones(IncrementalOrderedStream): replication_method = "INCREMENTAL" replication_keys = "updated_at" key_properties = ["id"] - path = "milestones?direction=desc&sort=updated_at" + path = "milestones?state=all&direction=desc&sort=updated_at" class Collaborators(FullTableStream): ''' @@ -741,6 +771,126 @@ def add_fields_at_1st_level(self, record, parent_record = None): record['user_id'] = record['user']['id'] +class Repos(FullTableStream): + ''' + https://docs.github.com/en/rest/repos/repos#list-organization-repositories + ''' + tap_stream_id = "repos" + replication_method = "FULL_TABLE" + key_properties = ["id"] + use_organization = True + path = "orgs/{}/repos?per_page=100" + has_children = True + children= ["repo_forked_parents"] + pk_child_fields = ['id'] + + def get_child_records(self, + client, + catalog, + child_stream, + grand_parent_id, + repo_path, + state, + start_date, + bookmark_dttm, + stream_to_sync, + selected_stream_ids, + parent_id=None, + parent_record=None): + if child_stream == 'repo_forked_parents' and parent_record and not parent_record.get('fork'): + # Skip fetching child records for non-forked repositories. + return + super().get_child_records(client, catalog, child_stream, grand_parent_id, repo_path, state, start_date, + bookmark_dttm, stream_to_sync, selected_stream_ids, parent_id, parent_record) + + +class RepoForkedParents(FullTableStream): + ''' + Get parent repositories of a forked repository. + https://docs.github.com/en/rest/repos/repos?apiVersion=2022-11-28#get-a-repository + ''' + tap_stream_id = "repo_forked_parents" + replication_method = "FULL_TABLE" + key_properties = ["id"] + use_organization = True + path = "repos/{}/{}" + parent = 'repos' + id_keys = ['name', 'default_branch'] + has_children = True + children = ["repo_forked_compares"] + pk_child_fields = ['id'] + + def add_fields_at_1st_level(self, record, parent_record = None): + """ + Add fields in the record explicitly at the 1st level of JSON. + """ + record['fork_name'] = record['parent']['name'] + record['fork_owner_login'] = record['parent']['owner']['login'] + record['fork_default_branch'] = record['parent']['default_branch'] + + def get_child_records(self, + client, + catalog, + child_stream, + grand_parent_id, + repo_path, + state, + start_date, + bookmark_dttm, + stream_to_sync, + selected_stream_ids, + parent_id=None, + parent_record=None): + try: + super().get_child_records(client, catalog, child_stream, grand_parent_id, repo_path, state, start_date, + bookmark_dttm, stream_to_sync, selected_stream_ids, parent_id, parent_record) + except UnprocessableError as e: + error_message = str(e) + if "HTTP-error-code: 422" in error_message and "Sorry, this diff is taking too long to generate" in error_message: + child_object = STREAMS[child_stream]() + record = {'_sdc_repository': repo_path, 'error_message': error_message} + child_object.add_fields_at_1st_level(record=record, parent_record=parent_record) + with singer.Transformer() as transformer: + if child_object.tap_stream_id in selected_stream_ids: + singer.write_record(child_object.tap_stream_id, record, time_extracted=singer.utils.now()) + + LOGGER.warning(f'Can\'t compare the forked repository ({selected_stream_ids}) with the parent repository. Error: {error_message}') + else: + raise e + + + +class RepoForkedCompares(FullTableStream): + ''' + Compare the repo with the default branch of the parent repository. + https://docs.github.com/en/rest/commits/commits?apiVersion=2022-11-28#compare-two-commits + ''' + tap_stream_id = "repo_forked_compares" + replication_method = "FULL_TABLE" + key_properties = ["full_name"] + use_organization = True + path = "repos/{}/{}/compare/{}...{}:{}" + parent = 'repo_forked_parents' + id_keys = ['fork_owner_login', 'fork_default_branch'] + single_page = True # No pagination required for this stream. + + def add_fields_at_1st_level(self, record, parent_record = None): + """ + Add fields in the record explicitly at the 1st level of JSON. + """ + record['name'] = parent_record['name'] + record['owner_login'] = parent_record['owner']['login'] + record['full_name'] = parent_record['full_name'] + record['default_branch'] = parent_record['default_branch'] + record['fork_owner_login'] = parent_record['fork_owner_login'] + record['fork_name'] = parent_record['fork_name'] + record['fork_default_branch'] = parent_record['fork_default_branch'] + if 'status' not in record and 'error_message' not in record: + record['error_message'] = ('Cannot retrieve ahead/behind information for this branch. ' + 'It may happen if the parent repository has been deleted or ' + 'no common ancestor between the default branches.') + + # Dictionary of the stream classes STREAMS = { "commits": Commits, @@ -753,6 +903,7 @@ def add_fields_at_1st_level(self, record, parent_record = None): "events": Events, "commit_comments": CommitComments, "issue_milestones": IssueMilestones, + "open_issues": OpenIssues, "projects": Projects, "project_columns": ProjectColumns, "project_cards": ProjectCards, @@ -764,5 +915,8 @@ def add_fields_at_1st_level(self, record, parent_record = None): "team_members": TeamMembers, "team_memberships": TeamMemberships, "collaborators": Collaborators, - "stargazers": StarGazers + "stargazers": StarGazers, + "repos": Repos, + "repo_forked_parents": RepoForkedParents, + "repo_forked_compares": RepoForkedCompares } diff --git a/tap_github/sync.py b/tap_github/sync.py index a83610ad..d0e4936b 100644 --- a/tap_github/sync.py +++ b/tap_github/sync.py @@ -4,7 +4,7 @@ from tap_github.streams import STREAMS LOGGER = singer.get_logger() -STREAM_TO_SYNC_FOR_ORGS = ['teams', 'team_members', 'team_memberships'] +STREAM_TO_SYNC_FOR_ORGS = ['teams', 'team_members', 'team_memberships', 'repos', 'repo_forked_parents', 'repo_forked_compares'] def get_selected_streams(catalog): ''' diff --git a/tests/unittests/test_custom_domain.py b/tests/unittests/test_custom_domain.py index 139b2426..3a517786 100644 --- a/tests/unittests/test_custom_domain.py +++ b/tests/unittests/test_custom_domain.py @@ -12,8 +12,8 @@ def test_config_without_domain(self, mock_verify_access): """ Test if the domain is not given in the config """ - mock_config = {'repository': 'singer-io/test-repo', "access_token": ""} - test_client = GithubClient(mock_config) + config = {'repository': 'singer-io/test-repo', "access_token": ""} + test_client = GithubClient(config) # Verify domain in client is default self.assertEqual(test_client.base_url, DEFAULT_DOMAIN) @@ -22,8 +22,24 @@ def test_config_with_domain(self, mock_verify_access): """ Test if the domain is given in the config """ - mock_config = {'repository': 'singer-io/test-repo', "base_url": "http://CUSTOM-git.com", "access_token": ""} - test_client = GithubClient(mock_config) + config = {'repository': 'singer-io/test-repo', "base_url": "http://CUSTOM-git.com", "access_token": ""} + test_client = GithubClient(config) # Verify domain in client is from config - self.assertEqual(test_client.base_url, mock_config["base_url"]) + self.assertEqual(test_client.base_url, config["base_url"]) + + def test_prepare_url(self, mock_verify_access): + """ + Test if the correct params are added to url + """ + config = {'repository': 'singer-io/test-repo', "base_url": "http://CUSTOM-git.com", "access_token": ""} + test_client = GithubClient(config) + + # Verify if per_page param was added with default value + self.assertEqual(test_client.prepare_url(test_client.base_url), "http://custom-git.com/?per_page=100") + self.assertEqual(test_client.prepare_url('http://CUSTOM-git.com/?q=query'), 'http://custom-git.com/?q=query&per_page=100') + + # Verify if per_page param was added as expected + config["max_per_page"] = 35 + test_client2 = GithubClient(config) + self.assertEqual(test_client2.prepare_url(test_client2.base_url), "http://custom-git.com/?per_page=35") diff --git a/tests/unittests/test_exception_handling.py b/tests/unittests/test_exception_handling.py index 8c381054..27f85be3 100644 --- a/tests/unittests/test_exception_handling.py +++ b/tests/unittests/test_exception_handling.py @@ -63,7 +63,7 @@ def test_json_decoder_error(self, mocked_parse_args, mocked_request, mock_verify @parameterized.expand([ [400, "The request is missing or has a bad parameter.", BadRequestException, '', {}, 1], - [401, "Invalid authorization credentials.", BadCredentialsException, '', {}, 1], + [401, "Invalid authorization credentials.", BadCredentialsException, '', {}, 3], [403, "User doesn't have permission to access the resource.", AuthException, '', {}, 1], [500, "An error has occurred at Github's end.", InternalServerError, '', {}, 5], [301, "The resource you are looking for is moved to another URL.", tap_github.client.MovedPermanentlyError, '', {}, 1], @@ -83,7 +83,7 @@ def test_error_message_and_call_count(self, mocked_parse_args, mocked_request, m expected_error_message = "HTTP-error-code: {}, Error: {}".format(erro_code, error_msg) with self.assertRaises(error_class) as e: - test_client.authed_get("", "") + test_client.authed_get_single_page("", "") # Verifying the message formed for the custom exception self.assertEqual(str(e.exception), expected_error_message) @@ -101,7 +101,7 @@ def test_skip_404_error(self, mock_logger, mocked_parse_args, mocked_request, m expected_message = "HTTP-error-code: 404, Error: The resource you have specified cannot be found. Alternatively the access_token is not valid for the resource. Please refer '{}' for more details.".format(json.get("documentation_url")) test_client = GithubClient(self.config) - test_client.authed_get("", "") + test_client.authed_get_single_page("", "") # Verifying the message formed for the custom exception self.assertEqual(mock_logger.mock_calls[0], mock.call(expected_message)) diff --git a/tests/unittests/test_get_all_repos.py b/tests/unittests/test_get_all_repos.py index 9235acad..fd332381 100644 --- a/tests/unittests/test_get_all_repos.py +++ b/tests/unittests/test_get_all_repos.py @@ -90,38 +90,39 @@ def test_multiple_organizations(self, mocked_authed_get_all_pages, mocked_verify self.assertListEqual(expected_repositories, side_effect) @mock.patch('tap_github.client.GithubClient.verify_repo_access') -@mock.patch('tap_github.client.GithubClient.authed_get') +@mock.patch('tap_github.client.GithubClient.authed_get_single_page') class TestAuthedGetAllPages(unittest.TestCase): """ Test `authed_get_all_pages` method from client. """ - config = {"access_token": "", "repository": "test-org/repo1"} + config = {"access_token": "", "repository": "test-org/repo1", "max_per_page": 100} - def test_for_one_page(self, mock_auth_get, mock_verify_access): + def test_for_one_page(self, mock_auth_get_single_page, mock_verify_access): """Verify `authed_get` is called only once if one page is available.""" test_client = GithubClient(self.config) - mock_auth_get.return_value = MockResponse({}) + mock_auth_get_single_page.return_value = MockResponse({}) - list(test_client.authed_get_all_pages("", "mock_url", {})) + list(test_client.authed_get_all_pages("", "http://mock_url", {})) # Verify `auth_get` call count - self.assertEqual(mock_auth_get.call_count, 1) + self.assertEqual(mock_auth_get_single_page.call_count, 1) - def test_for_multiple_pages(self, mock_auth_get, mock_verify_access): + def test_for_multiple_pages(self, mock_auth_get_single_page, mock_verify_access): """Verify `authed_get` is called equal number times as pages available.""" test_client = GithubClient(self.config) - mock_auth_get.side_effect = [MockResponse({"next": {"url": "mock_url_2"}}),MockResponse({"next": {"url": "mock_url_3"}}),MockResponse({})] + mock_auth_get_single_page.side_effect = [MockResponse({"next": {"url": "http://mock_url_2/?per_page=100"}}), + MockResponse({"next": {"url": "http://mock_url_3/?per_page=100"}}),MockResponse({})] - list(test_client.authed_get_all_pages("", "mock_url_1", {})) + list(test_client.authed_get_all_pages("", "http://mock_url_1", {})) # Verify `auth_get` call count - self.assertEqual(mock_auth_get.call_count, 3) + self.assertEqual(mock_auth_get_single_page.call_count, 3) # Verify `auth_get` calls with expected url - self.assertEqual(mock_auth_get.mock_calls[0], mock.call("", "mock_url_1", {}, '', True)) - self.assertEqual(mock_auth_get.mock_calls[1], mock.call("", "mock_url_2", {}, '', True)) - self.assertEqual(mock_auth_get.mock_calls[2], mock.call("", "mock_url_3", {}, '', True)) + self.assertEqual(mock_auth_get_single_page.mock_calls[0], mock.call("", "http://mock_url_1/?per_page=100", {}, '', True)) + self.assertEqual(mock_auth_get_single_page.mock_calls[1], mock.call("", "http://mock_url_2/?per_page=100", {}, '', True)) + self.assertEqual(mock_auth_get_single_page.mock_calls[2], mock.call("", "http://mock_url_3/?per_page=100", {}, '', True)) diff --git a/tests/unittests/test_rate_limit.py b/tests/unittests/test_rate_limit.py index 987c60a0..770acd29 100644 --- a/tests/unittests/test_rate_limit.py +++ b/tests/unittests/test_rate_limit.py @@ -2,10 +2,12 @@ from tap_github.client import rate_throttling, GithubException import unittest from unittest import mock +from math import ceil import time import requests DEFAULT_SLEEP_SECONDS = 600 +DEFAULT_MIN_REMAIN_RATE_LIMIT = 0 def api_call(): return requests.get("https://api.github.com/rate_limit") @@ -25,16 +27,15 @@ def test_rate_limt_wait(self, mocked_sleep): mocked_sleep.side_effect = None resp = api_call() - resp.headers["X-RateLimit-Reset"] = int(round(time.time(), 0)) + 120 + resp.headers["X-RateLimit-Reset"] = int(ceil(time.time())) + 120 resp.headers["X-RateLimit-Remaining"] = 0 - rate_throttling(resp, DEFAULT_SLEEP_SECONDS) + rate_throttling(resp, DEFAULT_SLEEP_SECONDS, DEFAULT_MIN_REMAIN_RATE_LIMIT) # Verify `time.sleep` is called with expected seconds in response - mocked_sleep.assert_called_with(120) + mocked_sleep.assert_called_with(121) self.assertTrue(mocked_sleep.called) - def test_rate_limit_exception(self, mocked_sleep): """ Test `rate_throttling` for 'sleep_time' greater than `MAX_SLEEP_SECONDS` @@ -43,14 +44,13 @@ def test_rate_limit_exception(self, mocked_sleep): mocked_sleep.side_effect = None resp = api_call() - resp.headers["X-RateLimit-Reset"] = int(round(time.time(), 0)) + 601 + resp.headers["X-RateLimit-Reset"] = int(ceil(time.time())) + 601 resp.headers["X-RateLimit-Remaining"] = 0 # Verify exception is raised with proper message with self.assertRaises(tap_github.client.RateLimitExceeded) as e: - rate_throttling(resp, DEFAULT_SLEEP_SECONDS) - self.assertEqual(str(e.exception), "API rate limit exceeded, please try after 601 seconds.") - + rate_throttling(resp, DEFAULT_SLEEP_SECONDS, DEFAULT_MIN_REMAIN_RATE_LIMIT) + self.assertEqual(str(e.exception), "API rate limit exceeded, please try after 602 seconds.") def test_rate_limit_not_exceeded(self, mocked_sleep): """ @@ -60,14 +60,30 @@ def test_rate_limit_not_exceeded(self, mocked_sleep): mocked_sleep.side_effect = None resp = api_call() - resp.headers["X-RateLimit-Reset"] = int(round(time.time(), 0)) + 10 + resp.headers["X-RateLimit-Reset"] = int(ceil(time.time())) + 10 resp.headers["X-RateLimit-Remaining"] = 5 - rate_throttling(resp, DEFAULT_SLEEP_SECONDS) + rate_throttling(resp, DEFAULT_SLEEP_SECONDS, DEFAULT_MIN_REMAIN_RATE_LIMIT) # Verify that `time.sleep` is not called self.assertFalse(mocked_sleep.called) + def test_rate_limit_wait_with_min_remain_rate_limit_defined(self, mocked_sleep): + """ + Test `rate_throttling` if remain rate limit > 0 and equal to `min_remain_rate_limit` + """ + + mocked_sleep.side_effect = None + + resp = api_call() + resp.headers["X-RateLimit-Reset"] = int(ceil(time.time())) + 10 + resp.headers["X-RateLimit-Remaining"] = 5 + + rate_throttling(resp, DEFAULT_SLEEP_SECONDS, min_remain_rate_limit=5) + + # Verify `time.sleep` is called + self.assertTrue(mocked_sleep.called) + def test_rate_limt_header_not_found(self, mocked_sleep): """ Test that the `rate_throttling` function raises an exception if `X-RateLimit-Reset` key is not found in the header. @@ -76,7 +92,7 @@ def test_rate_limt_header_not_found(self, mocked_sleep): resp.headers={} with self.assertRaises(GithubException) as e: - rate_throttling(resp, DEFAULT_SLEEP_SECONDS) + rate_throttling(resp, DEFAULT_SLEEP_SECONDS, DEFAULT_MIN_REMAIN_RATE_LIMIT) # Verifying the message formed for the invalid base URL self.assertEqual(str(e.exception), "The API call using the specified base url was unsuccessful. Please double-check the provided base URL.") diff --git a/tests/unittests/test_sync_endpoint.py b/tests/unittests/test_sync_endpoint.py index 338d9ea4..4ba60e47 100644 --- a/tests/unittests/test_sync_endpoint.py +++ b/tests/unittests/test_sync_endpoint.py @@ -37,7 +37,7 @@ def test_sync_without_state(self, mock_write_records, mock_authed_all_pages, moc self.assertEqual(final_state, expected_state) # Verify `get_auth_all_pages` called with expected url - mock_authed_all_pages.assert_called_with(mock.ANY, 'https://api.github.com/repos/tap-github/events', mock.ANY, stream='events') + mock_authed_all_pages.assert_called_with(mock.ANY, 'https://api.github.com/repos/tap-github/events', mock.ANY, 'events', True) # Verify `write_records` call count self.assertEqual(mock_write_records.call_count, 4) @@ -65,7 +65,7 @@ def test_sync_with_state(self, mock_write_records, mock_authed_all_pages, mock_v self.assertEqual(mock_write_records.call_count, 3) # Verify `get_auth_all_pages` called with expected url - mock_authed_all_pages.assert_called_with(mock.ANY, 'https://api.github.com/repos/tap-github/events', mock.ANY, stream='events') + mock_authed_all_pages.assert_called_with(mock.ANY, 'https://api.github.com/repos/tap-github/events', mock.ANY, 'events', True) mock_write_records.assert_called_with(mock.ANY, {'id': 4, 'created_at': '2019-01-02T00:00:00Z', '_sdc_repository': 'tap-github'},time_extracted = mock.ANY) @@ -91,7 +91,7 @@ def test_without_child_stream(self, mock_get_child_records, mock_authed_get_all_ test_stream.sync_endpoint(test_client, {}, self.catalog, "tap-github", "", ["stargazers"], ["stargazers"]) # Verify that the authed_get_all_pages() is called with the expected url - mock_authed_get_all_pages.assert_called_with(mock.ANY, "https://api.github.com/repos/tap-github/stargazers", mock.ANY, stream='stargazers') + mock_authed_get_all_pages.assert_called_with(mock.ANY, "https://api.github.com/repos/tap-github/stargazers", mock.ANY, 'stargazers', True) # Verify that the get_child_records() is not called as Stargazers doesn't have a child stream self.assertFalse(mock_get_child_records.called) @@ -110,7 +110,7 @@ def test_with_child_streams(self, mock_get_child_records, mock_authed_get_all_p test_stream.sync_endpoint(test_client, {}, self.catalog, "tap-github", "", ["teams", "team_members"], ["teams","team_members"]) # Verify that the authed_get_all_pages() is called with the expected url - mock_authed_get_all_pages.assert_called_with(mock.ANY, "https://api.github.com/orgs/tap-github/teams", mock.ANY, stream='teams') + mock_authed_get_all_pages.assert_called_with(mock.ANY, "https://api.github.com/orgs/tap-github/teams", mock.ANY, 'teams', True) # Verify that the get_child_records() is called self.assertTrue(mock_get_child_records.called) @@ -136,9 +136,9 @@ def test_with_nested_child_streams(self, mock_authed_get_all_pages, mock_verify_ self.assertEqual(mock_authed_get_all_pages.call_count, 4) # Verify that the authed_get_all_pages() is called with the expected url - exp_call_1 = mock.call(mock.ANY, "https://api.github.com/orgs/tap-github/teams", mock.ANY, stream='teams') - exp_call_2 = mock.call(mock.ANY, "https://api.github.com/orgs/tap-github/teams/stitch-dev/members", stream='team_members') - exp_call_3 = mock.call(mock.ANY, "https://api.github.com/orgs/tap-github/teams/stitch-dev/memberships/log1", stream='team_memberships') + exp_call_1 = mock.call(mock.ANY, "https://api.github.com/orgs/tap-github/teams", mock.ANY, 'teams', True) + exp_call_2 = mock.call(mock.ANY, "https://api.github.com/orgs/tap-github/teams/stitch-dev/members", {}, 'team_members', True) + exp_call_3 = mock.call(mock.ANY, "https://api.github.com/orgs/tap-github/teams/stitch-dev/memberships/log1", {}, 'team_memberships', True) self.assertEqual(mock_authed_get_all_pages.mock_calls[0], exp_call_1) self.assertEqual(mock_authed_get_all_pages.mock_calls[1], exp_call_2) @@ -166,7 +166,7 @@ def test_without_child_stream(self, mock_get_child_records, mock_authed_get_all_ test_stream.sync_endpoint(test_client, {}, self.catalog, "tap-github", "", ["commits"], ["commits"]) # Verify that the authed_get_all_pages() is called with the expected url - mock_authed_get_all_pages.assert_called_with(mock.ANY, "https://api.github.com/repos/tap-github/commits?since=", mock.ANY, stream='commits') + mock_authed_get_all_pages.assert_called_with(mock.ANY, "https://api.github.com/repos/tap-github/commits?since=", mock.ANY, 'commits', True) # Verify that the get_child_records() is not called as Commits does not contain any child stream. self.assertFalse(mock_get_child_records.called) @@ -184,8 +184,8 @@ def test_with_child_streams(self, mock_get_child_records, mock_authed_get_all_p test_stream.sync_endpoint(test_client, {}, self.catalog, "tap-github", "", ["projects", "project_columns"], ["projects","project_columns"]) # Verify that the authed_get_all_pages() is called with the expected url - mock_authed_get_all_pages.assert_called_with(mock.ANY, "https://api.github.com/repos/tap-github/projects?state=all", mock.ANY, stream='projects') - + mock_authed_get_all_pages.assert_called_with(mock.ANY, "https://api.github.com/repos/tap-github/projects?state=all", mock.ANY, 'projects', True) + # Verify that the get_child_records() is called as thw Projects stream has a child stream self.assertTrue(mock_get_child_records.called) @@ -208,9 +208,9 @@ def test_with_nested_child_streams(self, mock_authed_get_all_pages, mock_verify_ # Verify that the authed_get_all_pages() is called expected number of times self.assertEqual(mock_authed_get_all_pages.call_count, 4) - exp_call_1 = mock.call(mock.ANY, "https://api.github.com/repos/tap-github/projects?state=all", mock.ANY, stream='projects') - exp_call_2 = mock.call(mock.ANY, "https://api.github.com/projects/1/columns", stream='project_columns') - exp_call_3 = mock.call(mock.ANY, "https://api.github.com/projects/columns/1/cards", stream='project_cards') + exp_call_1 = mock.call(mock.ANY, "https://api.github.com/repos/tap-github/projects?state=all", mock.ANY, 'projects', True) + exp_call_2 = mock.call(mock.ANY, "https://api.github.com/projects/1/columns", {}, 'project_columns', True) + exp_call_3 = mock.call(mock.ANY, "https://api.github.com/projects/columns/1/cards", {}, 'project_cards', True) # Verify that the API calls are done as expected with the correct url self.assertEqual(mock_authed_get_all_pages.mock_calls[0], exp_call_1) @@ -240,7 +240,7 @@ def test_without_child_stream(self, mock_get_child_records, mock_strptime_to_utc test_stream.sync_endpoint(test_client, {}, self.catalog, "tap-github", "", ["pull_requests"], ["pull_requests"]) # Verify that the authed_get_all_pages() is called with the expected url - mock_authed_get_all_pages.assert_called_with(mock.ANY, "https://api.github.com/repos/tap-github/pulls?state=all&sort=updated&direction=desc", stream='pull_requests') + mock_authed_get_all_pages.assert_called_with(mock.ANY, "https://api.github.com/repos/tap-github/pulls?state=all&sort=updated&direction=desc", {}, 'pull_requests', True) @mock.patch("tap_github.streams.Stream.get_child_records") @@ -257,7 +257,7 @@ def test_with_child_streams(self, mock_get_child_records, mock_strptime_to_utc, test_stream.sync_endpoint(test_client, {}, self.catalog, "tap-github", "", ["pull_requests", "review_comments"], ["pull_requests","review_comments"]) # Verify that the authed_get_all_pages() is called with the expected url - mock_authed_get_all_pages.assert_called_with(mock.ANY, "https://api.github.com/repos/tap-github/pulls?state=all&sort=updated&direction=desc", stream='pull_requests') + mock_authed_get_all_pages.assert_called_with(mock.ANY, "https://api.github.com/repos/tap-github/pulls?state=all&sort=updated&direction=desc", {}, 'pull_requests', True) # Verify that the get_child_records() is called as the PullRequests stream has a child stream self.assertTrue(mock_get_child_records.called) @@ -281,8 +281,8 @@ def test_with_nested_child_streams(self, mock_strptime_to_utc, mock_authed_get_a self.assertEqual(mock_authed_get_all_pages.call_count, 2) print(mock_authed_get_all_pages.mock_calls) - exp_call_1 = mock.call(mock.ANY, "https://api.github.com/repos/tap-github/pulls?state=all&sort=updated&direction=desc", stream='pull_requests') - exp_call_2 = mock.call(mock.ANY, "https://api.github.com/repos/tap-github/pulls/1/comments?sort=updated_at&direction=desc", stream='review_comments') + exp_call_1 = mock.call(mock.ANY, "https://api.github.com/repos/tap-github/pulls?state=all&sort=updated&direction=desc", {}, 'pull_requests', True) + exp_call_2 = mock.call(mock.ANY, "https://api.github.com/repos/tap-github/pulls/1/comments?sort=updated_at&direction=desc", {}, 'review_comments', True) # Verify that the API calls are done as expected with the correct url self.assertEqual(mock_authed_get_all_pages.mock_calls[0], exp_call_1) diff --git a/tests/unittests/test_timeout.py b/tests/unittests/test_timeout.py index a3f6ca53..09129684 100644 --- a/tests/unittests/test_timeout.py +++ b/tests/unittests/test_timeout.py @@ -73,7 +73,7 @@ def test_timeout_value_in_config(self, mocked_parse_args, mocked_request, mocked # get the timeout value for assertion timeout = test_client.get_request_timeout() # function call - test_client.authed_get("test_source", "") + test_client.authed_get_single_page("test_source", "") # verify that we got expected timeout value self.assertEqual(expected_value, timeout) @@ -107,7 +107,7 @@ def test_backoff(self, mocked_parse_args, mocked_request, mocked_sleep, mock_ver test_client = GithubClient(mock_config) with self.assertRaises(error_class): - test_client.authed_get("test_source", "") + test_client.authed_get_single_page("test_source", "") # verify that we backoff 5 times self.assertEqual(5, mocked_request.call_count) diff --git a/tests/unittests/test_verify_access.py b/tests/unittests/test_verify_access.py index bdd93209..700ae4ed 100644 --- a/tests/unittests/test_verify_access.py +++ b/tests/unittests/test_verify_access.py @@ -60,3 +60,15 @@ def test_repo_bad_creds(self, mocked_parse_args, mocked_request, mock_verify_acc # Verify error with proper message self.assertEqual(str(e.exception), "HTTP-error-code: 401, Error: {}".format(json)) + + def test_repo_no_permission(self, mocked_parse_args, mocked_request, mock_verify_access): + """Verify if 404 error arises""" + test_client = GithubClient(self.config) + json = {"message": "Please check the repository name 'repo' or you do not have sufficient permissions to access this repository.", "documentation_url": "https://docs.github.com/"} + mocked_request.return_value = get_response(404, json, True) + + with self.assertRaises(tap_github.client.NotFoundException) as e: + test_client.verify_repo_access("", "repo") + + # Verify error with proper message + self.assertEqual(str(e.exception), "HTTP-error-code: 404, Error: {}".format(json['message']))