This code should query the GitHub API and, for the top 3 repositories in a given organization, filter the last 1000 commits down to the commit messages that include one word and exclude another. For example, I could query the endpoint /search with parameters such as org:facebook, include:fixed and exclude:try. This should then return, out of the 3 most popular facebook repositories and their 1000 most recent commits, the commit messages which include fixed and exclude try.
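For reference, a request against a local run of the app might look like this (a sketch; the URL assumes Flask's default port 5000, and the payload keys match what the handler below expects):

import requests

resp = requests.post(
    "http://127.0.0.1:5000/search",
    json={"org": "facebook", "include": "fixed", "ignore": "try"},
)
print(resp.json())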
from flask import Flask, request
import requests
import json

app = Flask(__name__)

NUM_REPOS = 3
MAX_PER_PAGE = 100
NUM_PAGES = 10
HEADERS = {"Accept": "application/vnd.github.cloak-preview"}


def get_matches():
    if request.method == 'POST':
        try:
            org = request.get_json()["org"]
            include = request.get_json()["include"]
            word_to_ignore = request.get_json()["ignore"]
        except:
            return {"error": "your request is missing the 'org', 'include' or 'ignore' parameters"}
        repo_data = requests.get(
            f"https://api.github.com/search/repositories?q=org:{org}&sort=stars&order=desc&per_page={NUM_REPOS}").content
        repo_data_dict = json.loads(repo_data.decode('utf-8'))
        if not "items" in repo_data_dict:
            if "errors" in repo_data_dict:
                return {"invalid_organization": f"{org} does not exist or doesn't have any repositories"}
            else:
                return {"reached_api_limit": True}
        else:
            repo_list = repo_data_dict["items"]
        matches = []
        for repo in repo_list:
            repo_info = {}
            repo_name = repo["name"]
            repo_info["repo"] = repo_name
            page = 1
            items_exist = True
            total_count = 0
            while (items_exist and page <= NUM_PAGES):
                commits = requests.get(
                    f"https://api.github.com/search/commits?q={include} repo:{org}/{repo_name}&per_page={MAX_PER_PAGE}&page={page}",
                    headers=HEADERS).content
                commits = json.loads(commits.decode('utf-8'))
                if "message" in commits:
                    matches.append({"reached_api_limit": True})
                    return {"matches": matches}
                if "items" in commits and len(commits["items"]) > 0 and (page == 1 or commits["total_count"] > page-1 * MAX_PER_PAGE):
                    for commit in commits["items"]:
                        if word_to_ignore in commit["commit"]["message"]:
                            continue
                        repo_info["message"] = commit["commit"]["message"]
                        matches.append(repo_info)
                        if commits["total_count"] < MAX_PER_PAGE:
                            items_exist = False
                else:
                    items_exist = False
                page += 1
        return {"matches": matches}


@app.route('/search', methods=['POST'])
def example():
    return get_matches()


if __name__ == '__main__':
    app.run()
Comment (AMC, Apr 29, 2021 at 13:57): Does the code work as expected?
1 Answer
The if request.method == 'POST': check is redundant, since the route only accepts POST; delete it.
Your error message, "your request is missing the 'org', 'include' or 'ignore' parameters", is less than helpful. Why not tell the user exactly which parameter they're missing?
This except block:

try:
    org = request.get_json()["org"]
    include = request.get_json()["include"]
    word_to_ignore = request.get_json()["ignore"]
except:
    return {"error": "your request is missing the 'org', 'include' or 'ignore' parameters"}

will tell the user that their request is missing a parameter even if the server runs out of memory, or the entire payload is the string banana instead of a well-formed JSON body, or someone attaches the server process to a console and presses Ctrl+C to break it. Are you sure that all three of these exceptions constitute the user having submitted a well-formed JSON payload missing one or more keys? Never use a bare except:.
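One possible sketch of a more targeted version, using Flask's get_json(silent=True) so that a malformed body becomes None instead of raising:

payload = request.get_json(silent=True)  # None if the body is not valid JSON
if not isinstance(payload, dict):
    return {"error": "the request body must be a JSON object"}
try:
    org = payload["org"]
except KeyError:
    return {"error": "missing the 'org' parameter"}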
Move your query string parameters into a params dictionary sent to requests.
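For example, the repository search could become something like this sketch (the same query string, just expressed as params):

repo_resp = requests.get(
    "https://api.github.com/search/repositories",
    params={
        "q": f"org:{org}",
        "sort": "stars",
        "order": "desc",
        "per_page": NUM_REPOS,
    },
)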
This:

if not "items" in repo_data_dict:
    if "errors" in repo_data_dict:
        return {"invalid_organization": f"{org} does not exist or doesn't have any repositories"}
    else:
        return {"reached_api_limit": True}
is a wild and crazy guess. What if 'errors' told you that the server is down for maintenance, or you submitted a malformed payload, or the endpoint is deprecated? You need to pay attention to the contents of the error message given to you.
Don't json.loads here; just call .json() on the response. Be sure to check for HTTP error conditions via response.raise_for_status().
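Continuing the params sketch from above:

repo_resp.raise_for_status()       # raises requests.HTTPError on a 4xx/5xx status
repo_data_dict = repo_resp.json()  # parses the JSON body; no manual json.loads needed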
You need to break up get_matches into multiple methods.
You need to improve separation of concerns: get_matches returns dictionaries intended for consumption by the REST API, but that shouldn't be done in that function; payload dictionary formation should be done above, in the Flask handler function. To indicate errors, you can throw your own exceptions, catch them in the method above and add that to the response dictionary. To indicate whether you hit the API limit, you can add a boolean to your return tuple.
total_count is unused; delete it.
Your response format is flat, and this can be inconvenient and inefficient to consume on the client side. Instead consider a dictionary of repositories, each value being a list of messages, rather than adding the repo name to every response element.
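For instance, the response could be shaped like this (a sketch with made-up repository names and messages):

{
    "matches": {
        "repo-one": ["fixed flaky test", "fixed typo in docs"],
        "repo-two": ["fixed crash on startup"],
    },
    "reached_api_limit": False,
}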
This check:

if commits["total_count"] < MAX_PER_PAGE:
    items_exist = False

seems to exist at the wrong level of indentation, since it applies to commits in general and not the current iterated commit.
This:

commits["total_count"] > page-1 * MAX_PER_PAGE

almost certainly does not do what you want, since it evaluates as page - (1 * MAX_PER_PAGE) and not (page - 1) * MAX_PER_PAGE, so add parentheses as appropriate.
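A quick standalone demonstration of the precedence difference:

MAX_PER_PAGE = 100
page = 3
print(page - 1 * MAX_PER_PAGE)    # -97: multiplication binds tighter than subtraction
print((page - 1) * MAX_PER_PAGE)  # 200: commits already consumed by earlier pages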
Some of the above suggestions baked into an example:
from pprint import pprint
from typing import List, Tuple

import requests

# Uncomment when wiring this up as a Flask route:
# from flask import Flask, request

NUM_REPOS = 3
MAX_PER_PAGE = 100
NUM_PAGES = 10
HEADERS = {"Accept": "application/vnd.github.cloak-preview"}


class InvalidOrganizationError(Exception):
    pass


class APILimitError(Exception):
    pass


# @app.route('/search', methods=('POST',))
def handle_search() -> dict:
    payload = request.get_json()
    try:
        org = payload['org']
    except KeyError:
        return {'error': 'Missing the "org" parameter'}
    try:
        include = payload['include']
    except KeyError:
        return {'error': 'Missing the "include" parameter'}
    try:
        word_to_ignore = payload['word_to_ignore']
    except KeyError:
        return {'error': 'Missing the "word_to_ignore" parameter'}

    matches, api_limited = get_matches(org, include, word_to_ignore)
    return {'matches': matches, 'reached_api_limit': api_limited}


def get_repo(org: str) -> List[dict]:
    with requests.get(
        'https://api.github.com/search/repositories',
        params={
            'q': f'org:{org}',
            'sort': 'stars',
            'order': 'desc',
            'per_page': NUM_REPOS,
        },
        headers=HEADERS,
    ) as resp:
        resp.raise_for_status()
        repo_data_dict = resp.json()

    repo_list = repo_data_dict.get('items')
    if repo_list is not None:
        return repo_list

    if 'errors' in repo_data_dict:
        raise InvalidOrganizationError(
            f'{org} does not exist or does not have any repositories'
        )
    raise APILimitError()


def get_commits(org: str, include: str, repo_name: str, page: int) -> dict:
    with requests.get(
        'https://api.github.com/search/commits',
        params={
            'q': f'{include} repo:{org}/{repo_name}',
            'per_page': MAX_PER_PAGE,
            'page': page,
        },
        headers=HEADERS,
    ) as resp:
        resp.raise_for_status()
        return resp.json()


def get_matches(
    org: str, include: str, word_to_ignore: str,
) -> Tuple[
    List[object],  # All matches
    bool,          # Did we hit the API limit?
]:
    repo_list = get_repo(org)
    matches = []

    for repo in repo_list:
        messages = []
        repo_info = {'repo': repo['name'], 'messages': messages}
        matches.append(repo_info)

        page = 1
        items_exist = True

        while items_exist and page <= NUM_PAGES:
            commits = get_commits(org, include, repo['name'], page)
            if "message" in commits:
                # A top-level "message" indicates an error payload, e.g. a rate limit.
                return matches, True

            if "items" in commits and (
                page == 1 or commits["total_count"] > (page - 1) * MAX_PER_PAGE
            ):
                for commit in commits["items"]:
                    message = commit["commit"]["message"]
                    if word_to_ignore in message:
                        continue
                    messages.append(message)
            else:
                break

            if commits["total_count"] < MAX_PER_PAGE:
                items_exist = False
            page += 1

    return matches, False


def test():
    matches, api_limited = get_matches('microsoft', 'repo', 'fetch')
    pprint(matches)


if __name__ == '__main__':
    test()