I've started learning Python several months ago. This code uses Google API to send URLs to Googlebot.
I appreciate it if someone can review my code, and advice better practices in classes/method structure, sending requests and logging responses.
How to start the program (link GiHub)
import json
import re
import httplib2
from oauth2client.service_account import ServiceAccountCredentials
class GoogleIndexationAPI:
def __init__(self, key_file, urls_list):
"""
:param key_file: .json key Google API filename
:param urls_list: .txt urls list filename
"""
self.key_file = key_file
self.urls_list = urls_list
def parse_json_key(self):
"""
Hello msg. Parses and validates JSON
"""
with open(self.key_file, 'r') as f:
key_data = json.load(f)
try:
input(f'Please add OWNER rights in GSC resource to: {key_data["client_email"]} \nand press Enter')
except Exception as e:
print(e, type(e))
exit()
@staticmethod
def get_domain():
"""
Input URL and strips it to a domain name
:return stripped_domain:
"""
domain = input('Enter domain you are going to work with: ')
stripped_domain = re.sub('(https://)|(http://)|(www.)|/(.*)', '', domain)
return stripped_domain
@staticmethod
def choose_method():
"""
Choosing a method for Google Indexing API request
:return method: method name
"""
while True:
choose_msg = input('Choose one of methods (print number) and press Enter \n'
'1 - URL_UPDATED\n'
'2 - URL_DELETED:\n')
if '1' in choose_msg:
method = 'URL_UPDATED'
break
elif '2' in choose_msg:
method = 'URL_DELETED'
break
else:
print('Please enter correct number')
print('You chose method: ', method)
return method
def get_urls(self):
"""
Gets URL list from a file and clean from not unique and not valid data
:return final_urls:
"""
urls = []
domain = self.get_domain()
try:
with open(self.urls_list, 'r', encoding='utf-8') as f:
for line in f:
urls.append(line.strip())
# Clean not unique urs
urls = list(set(urls))
# Delete urls without ^http or which don't contain our domain name
for url in urls.copy():
if ('http' not in url) or (domain not in url):
urls.pop(urls.index(url))
# 200 requests a day quota :(
if len(urls) > 200:
print(f'You have a 200 request per day quota. You are trying to index {len(urls)}. ')
len_answer = input(f'I will make requests only for the first 200 urls containing {domain}. '
f'Continue (YES/NO) ???\n')
if 'yes' in len_answer.lower():
final_urls = urls[0:199]
left_urls = urls[200:]
# Write urls over quota limit in file
with open('not_send_urls.txt', 'w', encoding='utf-8') as log:
for item in left_urls:
log.write(f'{item}\n')
print(f'There are {len(left_urls)} not send to Googlebot. \n'
f'Check not_send_urls.txt file in the script folder')
elif 'no' in len_answer.lower():
exit()
else:
print('Please enter correct answer (YES / NO)')
else:
final_urls = urls
if len(final_urls) < 1:
assert print('There are no urls in your file')
exit()
return final_urls
except Exception as e:
print(e, type(e))
exit()
def single_request_index(self, __url, __method):
"""
Makes a request to Google Indexing API with a selected method
:param __url:
:param __method:
:return content:
"""
api_scopes = ["https://www.googleapis.com/auth/indexing"]
api_endpoint = "https://indexing.googleapis.com/v3/urlNotifications:publish"
credentials = ServiceAccountCredentials.from_json_keyfile_name(self.key_file, scopes=api_scopes)
try:
http = credentials.authorize(httplib2.Http())
r_content = """{""" + f"'url': '{__url}', 'type': '{__method}'" + """}"""
response, content = http.request(api_endpoint, method="POST", body=r_content)
return content
except Exception as e:
print(e, type(e))
def indexation_worker(self):
"""
Run this method after class instance creating.
Gets an URL list, parses JSON key file, chooses API method,
then sends a request for an each URL and logs responses.
"""
self.parse_json_key()
urls = self.get_urls()
method = self.choose_method()
with open('logs.txt', 'w', encoding='utf-8') as f:
for url in urls:
result = self.single_request_index(url, method)
f.write(f'{result}\n')
print(f'Done! We\'ve sent {len(urls)} URLs to Googlebot.\n'
f'You can check responses in logs.txt')
if __name__ == '__main__':
g_index = GoogleIndexationAPI('cred.json', 'urls.txt')
g_index.indexation_worker()
-
\$\begingroup\$ Welcome to the Code Review Community. The title should be a brief introduction into what the code does rather than your concerns about the code. Please read How do I ask a good question. \$\endgroup\$pacmaninbw– pacmaninbw ♦2021年09月04日 11:47:19 +00:00Commented Sep 4, 2021 at 11:47
1 Answer 1
- Add PEP484 type hints, such as
get_domain() -> str
- The exception handling here:
try:
input(f'Please add OWNER rights in GSC resource to: {key_data["client_email"]} \nand press Enter')
except Exception as e:
print(e, type(e))
exit()
is a little odd. Do you expect for the exception to come from the fact that client_email
is missing as a key? If so, it's best to check for that directly rather than rely on an exception. Even if you were to rely on the exception, you should narrow the caught type. Finally: this input
doesn't look to belong in this method, and is an interaction that should appear in an area of the code dedicated to user interface.
- Your regular expression
'(https://)|(http://)|(www.)|/(.*)'
doesn't look strict enough; you should probably match on ^http
to make sure you're looking at the beginning of the string. However, I think the sub()
is at odds with what you're asking the user. You're asking for the domain and not a URL, so why bother stripping off the protocol? I'd just let it fail at the network level if they give you something that isn't a domain. Better yet, rather than attempting to be clever and strip off substrings, just loop-validate that it's a valid domain by passing it to socket.gethostbyname
.
Along the same lines,
urls = list(set(urls))
# Delete urls without ^http or which don't contain our domain name
for url in urls.copy():
if ('http' not in url) or (domain not in url):
urls.pop(urls.index(url))
doesn't actually check for ^http
as you've commented, so either you need to use a regex, or use startswith
. Consider running the urls through urllib.parse.urlparse
and verifying that the scheme is either http
or https
, and that the netloc is equal to the given domain.
Also, this block can be replaced with
url = [u for u in set(urls)
if u.startswith('http') and domain in u]
Consider representing the return value of choose_method
as an enum rather than an unconstrained string.
What's this? """{"""
You don't need a triple-quoted string for one brace. Is r_content
supposed to be JSON? If so, you need to replace your single quotes with double quotes. You should also enclose this literal string in double quotes:
print(f'Done! We\'ve sent {len(urls)} URLs to Googlebot.\n'
f'You can check responses in logs.txt')
so that you don't have to escape your apostrophes.