from flask import Flask, request, jsonify
import hashlib
import random
import string

app = Flask(__name__)

url_mapping = {}

def generate_short_url(url):
    hash_object = hashlib.md5(url.encode())
    return hash_object.hexdigest()[:6]  # Shortened URL is the first 6 characters of the MD5 hash

def is_short_url_unique(short_url):
    return short_url not in url_mapping

def generate_random_string(length):
    characters = string.ascii_letters + string.digits
    return ''.join(random.choice(characters) for i in range(length))

def generate_unique_short_url(url):
    while True:
        short_url = generate_short_url(url)
        if is_short_url_unique(short_url):
            return short_url

@app.route('/shorten', methods=['POST'])
def shorten_url():
    data = request.get_json()
    if 'url' not in data:
        return jsonify({"error": "URL is required"}), 400
    original_url = data['url']
    short_url = generate_unique_short_url(original_url)
    url_mapping[short_url] = original_url
    return jsonify({
        "original_url": original_url,
        "shortened_url": f"https://{request.host}/{short_url}"
    })

@app.route('/<short_url>')
def redirect_to_original_url(short_url):
    if short_url in url_mapping:
        original_url = url_mapping[short_url]
        return jsonify({"original_url": original_url}), 302
    else:
        return jsonify({"error": "Shortened URL not found"}), 404

if __name__ == '__main__':
    app.run(debug=True)
This is a URL shortener API. I am wondering if the algorithm for generating the hash can be improved.
3 Answers
I do have a few comments:

Handling Non-Unique Digests

It's good that you recognize that digests such as MD5 are not guaranteed to be unique. But although you have a function generate_random_string, it does not appear to be used anywhere. Instead, if you find that calling generate_short_url has generated a non-unique digest for the URL, you just re-call the same function with the same URL argument. How would this not loop indefinitely if you did get a digest collision?

Here is an idea of how you might replace generate_short_url and generate_unique_short_url:
def short_url_generator():
    from itertools import product

    url_chars = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    for n in range(1, 6):
        for short_url in product(url_chars, repeat=n):
            yield ''.join(short_url)

it = short_url_generator()

def generate_unique_short_url():
    return next(it)
The above generate_unique_short_url function will first generate 62 unique URLs of length 1, then 62² = 3,844 unique URLs of length 2, and so on up through length 5. This also provides for the generation of not only more URLs, but shorter ones as well.
Encapsulate in a Class For Greater Reusability and Data Hiding
class UrlGenerator:
    """Generate unique shortened URLs."""

    def __init__(self,
                 url_chars: str = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
                 ):
        def generate_short_url():
            from itertools import product

            n = 1
            while True:
                for short_url in product(url_chars, repeat=n):
                    yield ''.join(short_url)
                n += 1

        self._it = generate_short_url()

    def next_url(self) -> str:
        """Return the next URL."""
        return next(self._it)

url_generator = UrlGenerator()

def get_unique_short_url():
    return url_generator.next_url()
Note that I have added both docstrings and type hinting, and removed the limit on the length of the generated URL so that an unlimited number of URLs can be generated.
Update: Saving The Generator's State with Random Shuffling
The one potential problem with the above implementation is that it uses generator functions and as such cannot be "pickled". Being able to save and restore the state of the URL generator becomes necessary if you have to restart an application that uses it. The following code, though a bit more complicated, allows the current state of the generator to be serialized.
This class generates each character in the URL using a separate, randomly shuffled version of the input url_chars string. This makes it more difficult to guess what the next URL generated will be.
...
import string
import random
...
class UrlGenerator:
    """Generate unique shortened URLs."""

    def __init__(self,
                 *,
                 url_chars: str = (string.digits +
                                   string.ascii_lowercase +
                                   string.ascii_uppercase
                                   ),
                 min_length: int = 1
                 ):
        """The initializer:

        url_chars: Character set to be used for generating the URLs.
        min_length: The minimum length of the generated URL."""
        self._url_chars = list(url_chars)
        self._url_chars_list = []
        for _ in range(min_length):
            self._new_random_chars()
        self._indices = [0] * min_length

    def _new_random_chars(self) -> None:
        """Append a randomly shuffled copy of url_chars to _url_chars_list."""
        lst = self._url_chars.copy()
        random.shuffle(lst)
        self._url_chars_list.append(''.join(lst))

    def next_url(self) -> str:
        """Generate the next URL."""
        # Find the rightmost index that has not run off the end of the
        # character set, carrying leftward like an odometer.
        current_index_index = len(self._indices) - 1
        while True:
            current_index = self._indices[current_index_index]
            if current_index < len(self._url_chars):
                break
            self._indices[current_index_index] = 0
            current_index_index -= 1
            if current_index_index < 0:
                # Every combination is exhausted: grow the URL by one
                # character, backed by a freshly shuffled character set.
                self._indices.append(0)
                self._new_random_chars()
                break
            self._indices[current_index_index] += 1
        url = ''.join(
            self._url_chars_list[list_index][char_index]
            for list_index, char_index in enumerate(self._indices)
        )
        self._indices[-1] += 1
        return url
url_generator = UrlGenerator()
urls = [url_generator.next_url() for _ in range(62 * 3)]
print(urls)
Prints:
['F', 'X', 'm', 'G', 'B', 'u', 'E', 'J', 'a', 'v', 'y', '3', 'i', 'K', 'C', 'W', 'e', 'Z', 'N', '0', 't', 'o', 's', 'S', 'k', '4', 'x', 'Y', 'c', 'T', 'Q', 'j', 'n', 'l', 'D', 'P', 'w', 'R', 'L', 'p', 'q', 'd', '5', '2', 'g', '9', '6', '1', 'U', 'M', '8', 'V', 'f', 'O', 'z', 'b', 'r', 'h', 'H', 'A', 'I', '7', 'Fe', 'F6', 'FB', 'Fq', 'FC', 'FW', 'Fo', 'FA', 'FE', 'FR', 'FN', 'Fd', 'Fh', 'Fs', 'FF', 'FM', 'F3', 'FY', 'FG', 'Fk', 'F9', 'FV', 'FK', 'FT', 'F1', 'Fb', 'FX', 'Fj', 'FI', 'Fw', 'Fz', 'FH', 'Fc', 'Fa', 'Ff', 'Fx', 'FO', 'Fp', 'Fi', 'Fu', 'Fy', 'F4', 'F7', 'Fg', 'F2', 'FJ', 'FZ', 'FQ', 'FU', 'F5', 'Ft', 'F0', 'FP', 'F8', 'FD', 'Fm', 'Fr', 'FS', 'Fn', 'FL', 'Fv', 'Fl', 'Xe', 'X6', 'XB', 'Xq', 'XC', 'XW', 'Xo', 'XA', 'XE', 'XR', 'XN', 'Xd', 'Xh', 'Xs', 'XF', 'XM', 'X3', 'XY', 'XG', 'Xk', 'X9', 'XV', 'XK', 'XT', 'X1', 'Xb', 'XX', 'Xj', 'XI', 'Xw', 'Xz', 'XH', 'Xc', 'Xa', 'Xf', 'Xx', 'XO', 'Xp', 'Xi', 'Xu', 'Xy', 'X4', 'X7', 'Xg', 'X2', 'XJ', 'XZ', 'XQ', 'XU', 'X5', 'Xt', 'X0', 'XP', 'X8', 'XD', 'Xm', 'Xr', 'XS', 'Xn', 'XL', 'Xv', 'Xl']
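Since the state is now held in plain lists and strings rather than a live generator object, a minimal sketch of the save/restore round trip (assuming the UrlGenerator class is importable wherever it is unpickled):

import pickle

# The instance contains only lists and strings, so pickle can serialize it.
blob = pickle.dumps(url_generator)
restored = pickle.loads(blob)

# Both copies resume from the same state.
assert restored.next_url() == url_generator.next_url()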
Kate (Mar 25, 2024): What about import string and url_chars = string.digits + string.ascii_lowercase + string.ascii_uppercase?

Booboo (Mar 25, 2024): @Kate That's a fine suggestion that I have incorporated in the serializable version.
organize imports
from flask import ...
import hashlib
PEP-8 asks that you organize your imports in three sections. As written, you are suggesting to me that a pip install happened and you are importing from a PyPI project, rather than using batteries included.

Life is too short for you to sweat such details. Let a utility worry about it. isort your imports, so you don't have to.
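For reference, a sketch of the grouping isort would produce for this module (only two of PEP-8's three sections apply here, since there are no local imports):

# Standard library
import hashlib
import random
import string

# Third-party (pip-installed)
from flask import Flask, jsonify, request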
BTW, on the topic of such minutiae, thank you for the nice __main__ guard.
explicit contract
def generate_short_url(url):
    hash_object = hashlib.md5(url.encode())
The function name is brilliant, very informative. The identifier for the parameter is similarly brilliant, couldn't be improved. But alas! It is ambiguous. One might plausibly view a "url" here as the namedtuple that urlparse returns. This would be a stronger signature with type annotation:
def generate_short_url(url: str) -> str:
It would really benefit from the addition of a doctest docstring which shows how "https://example.com" is transformed. A difficulty that runs through this codebase is whether "url" denotes a relative or a fully-qualified URL containing scheme and so on. Spelling out an example would help to clarify the contract.
It is well worth stressing in the docstring that we generate a relative url, which omits e.g. scheme and hostname.
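For example, a sketch of such a contract; the doctests below assert properties of the output (length, determinism) rather than a specific digest value:

import hashlib

def generate_short_url(url: str) -> str:
    """Return a 6-character relative identifier derived from a fully-qualified url.

    The result omits scheme and hostname; the caller prepends those.

    >>> len(generate_short_url("https://example.com"))
    6
    >>> generate_short_url("https://example.com") == generate_short_url("https://example.com")
    True
    """
    return hashlib.md5(url.encode()).hexdigest()[:6]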
Kerckhoffs's principle
Consider prepending a secret PEPPER constant to the input url before hashing it, e.g. "x42" or some GUID. Then an adversary ignorant of the pepper can't make offline attempts to produce a colliding URL pair.
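A minimal sketch of the idea; the PEPPER value here is only a placeholder, and in practice you would load the real secret from configuration rather than source control:

import hashlib

PEPPER = "x42"  # placeholder; load the real secret from the environment

def generate_short_url(url: str) -> str:
    # An adversary who doesn't know the pepper can't precompute
    # a colliding pair of URLs offline.
    return hashlib.md5((PEPPER + url).encode()).hexdigest()[:6]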
A very nice aspect of shortening with a hash rather than with a counter is that if Alice and Bob are interacting with the generator, each occasionally shortening a new URL, their activity is opaque to one another. They cannot easily learn the activity rate of the other participant, without resorting to an out-of-band method. Sometimes concealing such details has business value.
weak hash
We opted for an output space of 2 ** 24 distinct hashes (six hex characters at 4 bits each), so by the birthday bound we're likely to encounter a collision after ballpark four thousand inputs. This seems a small enough number that it's worth briefly discussing the rationale and design tradeoffs in the docstring.
When faced with "budget is six ASCII characters", many designers would opt for base64 encoding, as it lets us stuff 6 bits per character, rather than hexadecimal's 4 bits. Notice that 36 bits modulo 8 is nonzero, leading to == padding in the default encoding. But you don't have to recover the (fixed) length from the encoding, so you can just send in 40 or 48 bits and preserve the first six encoded characters. Call this "pseudo base64" if you like. Recall that we will want the url-safe variant.
MD5 has known compromises / collisions. Prefer SHA3 for new designs.
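A sketch combining the two suggestions: feed 48 bits of a SHA3 digest through the url-safe encoder and keep the first six characters.

import base64
import hashlib

def generate_short_url(url: str) -> str:
    # Six base64 characters carry 36 bits, versus 24 bits for six hex digits.
    # Encoding 6 bytes (48 bits) yields 8 characters with no '=' padding;
    # we keep only the first six.
    digest = hashlib.sha3_256(url.encode()).digest()
    return base64.urlsafe_b64encode(digest[:6]).decode()[:6]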
persistence
url_mapping = {}
Consider using the json module to dump / load this data structure. Or sqlite, or even the Sleepycat key-value store.

Consider logging each input url to a text file, so a subsequent reporting script could detect colliding urls. As it stands we'd have to examine the global flask or nginx log, with lots of unrelated requests mixed in, and POST parameters perhaps not visible.
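A minimal sketch of the json approach (the filename is an arbitrary example):

import json

MAPPING_FILE = "url_mapping.json"  # arbitrary example path

def save_mapping() -> None:
    with open(MAPPING_FILE, "w") as f:
        json.dump(url_mapping, f)

def load_mapping() -> dict:
    # Start empty on first run, when the file doesn't exist yet.
    try:
        with open(MAPPING_FILE) as f:
            return json.load(f)
    except FileNotFoundError:
        return {}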
infinite loop
def generate_unique_short_url(url):
    while True:
        short_url = generate_short_url(url)
        if is_short_url_unique(short_url):
            return short_url
This would only make sense if we were repeatedly drawing fresh candidates, e.g. from generate_random_string.

The loop intends to enforce a loop variant: "short_url shall w.h.p. be different on each iteration." But as written, it is guaranteed to be identical on each iteration, due to using a deterministic hash function. The caller, shorten_url(), doesn't check for a duplicate submission and then suppress re-generating for the same url.
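One way to make the retry loop meaningful, sketched with the question's own helpers: hash first, then fall back to random candidates on collision.

def generate_unique_short_url(url):
    short_url = generate_short_url(url)
    while not is_short_url_unique(short_url):
        # The hash is deterministic, so on a collision draw a fresh
        # random candidate instead of re-hashing the same input.
        short_url = generate_random_string(6)
    return short_url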
loss of protocol
return jsonify({
    "original_url": original_url,
    "shortened_url": f"https://{request.host}/{short_url}"
})
This is a bit terrible. Please assert original_url.startswith("https://"), if that is your intent. As written, "http://example.com" and "ftp://example.com" will both be shortened to the same thing, and it won't be what the end user intended.
In a similar vein, using urlparse() output would let you normalize some of the input, so we don't get different results from these:
HTTP://example.COM
hTTp://eXample.com
Granted, in an IDNA world this can be a complex topic. At some point the end user perhaps deserves what they get.
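A sketch of that normalization; the scheme and host are case-insensitive, so lowercasing them is safe, while the (case-sensitive) path is left alone:

from urllib.parse import urlparse, urlunparse

def normalize_url(url: str) -> str:
    # Lowercase the case-insensitive parts so trivially different
    # spellings of the same URL produce the same hash.
    parts = urlparse(url)
    return urlunparse(parts._replace(scheme=parts.scheme.lower(),
                                     netloc=parts.netloc.lower()))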
dict .get()

if short_url in url_mapping:
    ... = url_mapping[short_url]

This is less convenient than unconditionally assigning url_mapping.get(short_url), and then testing whether the result is None.
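Applied to the redirect handler, the pattern looks like this:

original_url = url_mapping.get(short_url)
if original_url is None:
    return jsonify({"error": "Shortened URL not found"}), 404
return jsonify({"original_url": original_url}), 302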
design of your URL namespace
@app.route('/shorten', methods=['POST'])
...
@app.route('/<short_url>')
I know we're trying to shorten URLs, but consider making that last one slightly longer:
@app.route('/s/<short_url>')
This carves out distinct namespace, which you'll be thankful for a few months down the road.
It happens that /shorten uses a 7-character identifier which won't collide with 6-character short URLs. But what if we later want a few more bits of collision resistance? Things change.
Another way to carve out namespace is to allocate a new vhost domain name, dedicated exclusively to serving redirects.
status code
I feel the 400 and 404 magic numbers are obvious and fine as-is.
return jsonify({"original_url": original_url}), 302
If you're going with "old style", this seems more like a "301 Moved Permanently". For newly authored code, prefer "308 Permanent Redirect".
I will note in passing that werkzeug's HTTP_STATUS_CODES offers these in symbolic form.
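Putting these suggestions together, a sketch using Flask's redirect helper, so the client receives an actual HTTP redirect rather than a JSON body carrying a redirect status:

from flask import redirect

@app.route('/<short_url>')
def redirect_to_original_url(short_url):
    original_url = url_mapping.get(short_url)
    if original_url is None:
        return jsonify({"error": "Shortened URL not found"}), 404
    # 308 Permanent Redirect, preferred over 301/302 for new code.
    return redirect(original_url, code=308)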
This code achieves its design goals (modulo hanging, and losing its map upon restart).
I would be willing to delegate or accept maintenance tasks on it.
Bee H. (Mar 25, 2024): Your critique "loss of protocol" is incorrect/misguided. The request.host is @Booboo's host name. So if I pass in the URL https://google.com/search?q=the-answer-to-life-the-universe-everything, the returned shortened_url would look something like https://booboo.com/a3bf1c

J_H (Mar 25, 2024): @BeeH. yeah, I think I agree with you that Author failed to convey a technical idea, or that in my (frequent!) foolishness and fallibility I failed to grasp the idea. This comes back to the frequent interchange / ambiguity in this codebase between full and relative URLs when we see "url" mentioned. Given the observed diversity of interpretations so far imposed, it suggests that being more rigorous and explicit about use of the term would, in future months, be a boon to maintenance engineers. I have grown accustomed in various codebases to seeing BASE_URL + foo, which dispels ambiguity.
At this time, there is no data persistence. If you used a database, then each record would have its own auto-incremented ID, and you could use the ID as an element of "uniqueness" to derive the hash. Then the hash should logically be recorded in the table with a UNIQUE constraint.
Also, it is not uncommon for URL shortener services to let users choose their own identifier, as long as it is not already in use. This could be an optional parameter.
Some users like to have vanity URLs to make them more memorable or better stand out. Eg: https://shorturl.at/sayaman
Then they may want to have an identifier slightly longer (or shorter) than 6 characters. So I would allow a variable length for the "hash".
The code is short and to the point, easy to read and follow. The thing that is less convincing is the loop in generate_unique_short_url.

For better randomness, consider using secrets.choice(). But your identifiers are too short to avoid a reasonable risk of collision. In practice it would take a lot of iterations for this to happen.
Seems to me that you could really use something more rudimentary, like encoding the record ID using a base64 scheme or similar, plus optional padding if desired when the resulting string is very short.

By the way, you are currently using a range of 62 characters, which looks almost like base64. This is actually base62.
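A sketch of encoding a record ID in base62, with optional zero-padding for very small IDs (the function name and min_length parameter are illustrative):

import string

BASE62 = string.digits + string.ascii_lowercase + string.ascii_uppercase

def base62_encode(record_id: int, min_length: int = 1) -> str:
    # Repeatedly divide by 62, collecting remainders as digits.
    chars = []
    n = record_id
    while n:
        n, rem = divmod(n, 62)
        chars.append(BASE62[rem])
    encoded = ''.join(reversed(chars)) or BASE62[0]
    # Left-pad with '0' so short results reach min_length characters.
    return encoded.rjust(min_length, BASE62[0])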
Booboo (Mar 26, 2024): I would be using what you call "base62" but the OP's code is producing a hex digest, which uses 16 possible characters: 0...9a...f