from flask import Flask, request, jsonify
import hashlib
import random
import string

app = Flask(__name__)

url_mapping = {}

def generate_short_url(url):
    hash_object = hashlib.md5(url.encode())
    return hash_object.hexdigest()[:6]  # Shortened URL is the first 6 characters of the MD5 hash

def is_short_url_unique(short_url):
    return short_url not in url_mapping

def generate_random_string(length):
    characters = string.ascii_letters + string.digits
    return ''.join(random.choice(characters) for i in range(length))

def generate_unique_short_url(url):
    while True:
        short_url = generate_short_url(url)
        if is_short_url_unique(short_url):
            return short_url

@app.route('/shorten', methods=['POST'])
def shorten_url():
    data = request.get_json()
    if 'url' not in data:
        return jsonify({"error": "URL is required"}), 400
    original_url = data['url']
    short_url = generate_unique_short_url(original_url)
    url_mapping[short_url] = original_url
    return jsonify({
        "original_url": original_url,
        "shortened_url": f"https://{request.host}/{short_url}"
    })

@app.route('/<short_url>')
def redirect_to_original_url(short_url):
    if short_url in url_mapping:
        original_url = url_mapping[short_url]
        return jsonify({"original_url": original_url}), 302
    else:
        return jsonify({"error": "Shortened URL not found"}), 404

if __name__ == '__main__':
    app.run(debug=True)
This is a URL shortener API. I am wondering if the algorithm for generating the hash can be improved.
3 Answers
I do have a few comments:

Handling Non-Unique Digests

It's good that you recognize that digests such as MD5 are not guaranteed to be unique. But although you have a function generate_random_string, it does not appear to be used anywhere. Instead, if you find that calling generate_short_url has generated a non-unique digest for the URL, you just re-call the same function with the same URL argument. How would this not loop indefinitely if you did get a digest collision?

Here is an idea of how you might replace generate_short_url and generate_unique_short_url:
def short_url_generator():
    from itertools import product

    url_chars = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    for n in range(1, 6):
        for short_url in product(url_chars, repeat=n):
            yield ''.join(short_url)

it = short_url_generator()

def generate_unique_short_url():
    return next(it)
The above generate_unique_short_url function will first generate 62 unique URLs of length 1, then 62² = 3,844 unique URLs of length 2, and so on up through length 5. This also provides for the generation of not only more URLs, but shorter ones as well.
Encapsulate in a Class For Greater Reusability and Data Hiding
class UrlGenerator:
    """Generate unique shortened URLs."""

    def __init__(self,
                 url_chars: str = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
                 ):
        def generate_short_url():
            from itertools import product

            n = 1
            while True:
                for short_url in product(url_chars, repeat=n):
                    yield ''.join(short_url)
                n += 1

        self._it = generate_short_url()

    def next_url(self) -> str:
        """Return the next URL."""
        return next(self._it)

url_generator = UrlGenerator()

def get_unique_short_url():
    return url_generator.next_url()
Note that I have added both docstrings and type hinting, and removed the limit on the length of the generated URL so that an unlimited number of URLs can be generated.
Update: Saving The Generator's State with Random Shuffling
The one potential problem with the above implementation is that it uses generator functions and as such cannot be "pickled". Being able to save and restore the state of the URL generator becomes necessary if you have to restart an application that uses it. The following code, though a bit more complicated, allows the current state of the generator to be serialized.
This class generates each character in the URL using a separate, randomly shuffled version of the input url_chars string. This makes it more difficult to guess what the next URL generated will be.
...
import string
import random
...
class UrlGenerator:
    """Generate unique shortened URLs."""

    def __init__(self,
                 *,
                 url_chars: str = (string.digits +
                                   string.ascii_lowercase +
                                   string.ascii_uppercase
                                   ),
                 min_length: int = 1
                 ):
        """The initializer:

        url_chars: Character set to be used for generating the URLs.
        min_length: The minimum length of the generated URL."""
        self._url_chars = list(url_chars)
        self._url_chars_list = []
        for _ in range(min_length):
            self._new_random_chars()
        self._indices = [0] * min_length

    def _new_random_chars(self) -> None:
        """Append a randomly shuffled copy of url_chars to _url_chars_list."""
        lst = self._url_chars.copy()
        random.shuffle(lst)
        self._url_chars_list.append(''.join(lst))

    def next_url(self) -> str:
        """Generate the next URL."""
        # Find the rightmost index that has not run off the end of the
        # character set, carrying leftward like an odometer.
        current_index_index = len(self._indices) - 1
        while True:
            current_index = self._indices[current_index_index]
            if current_index < len(self._url_chars):
                break
            self._indices[current_index_index] = 0
            current_index_index -= 1
            if current_index_index < 0:
                # Every combination is exhausted: grow the URL by one
                # character, backed by a freshly shuffled character set.
                self._indices.append(0)
                self._new_random_chars()
                break
            self._indices[current_index_index] += 1
        url = ''.join(
            self._url_chars_list[list_index][char_index]
            for list_index, char_index in enumerate(self._indices)
        )
        self._indices[-1] += 1
        return url
url_generator = UrlGenerator()
urls = [url_generator.next_url() for _ in range(62 * 3)]
print(urls)
Prints:
['F', 'X', 'm', 'G', 'B', 'u', 'E', 'J', 'a', 'v', 'y', '3', 'i', 'K', 'C', 'W', 'e', 'Z', 'N', '0', 't', 'o', 's', 'S', 'k', '4', 'x', 'Y', 'c', 'T', 'Q', 'j', 'n', 'l', 'D', 'P', 'w', 'R', 'L', 'p', 'q', 'd', '5', '2', 'g', '9', '6', '1', 'U', 'M', '8', 'V', 'f', 'O', 'z', 'b', 'r', 'h', 'H', 'A', 'I', '7', 'Fe', 'F6', 'FB', 'Fq', 'FC', 'FW', 'Fo', 'FA', 'FE', 'FR', 'FN', 'Fd', 'Fh', 'Fs', 'FF', 'FM', 'F3', 'FY', 'FG', 'Fk', 'F9', 'FV', 'FK', 'FT', 'F1', 'Fb', 'FX', 'Fj', 'FI', 'Fw', 'Fz', 'FH', 'Fc', 'Fa', 'Ff', 'Fx', 'FO', 'Fp', 'Fi', 'Fu', 'Fy', 'F4', 'F7', 'Fg', 'F2', 'FJ', 'FZ', 'FQ', 'FU', 'F5', 'Ft', 'F0', 'FP', 'F8', 'FD', 'Fm', 'Fr', 'FS', 'Fn', 'FL', 'Fv', 'Fl', 'Xe', 'X6', 'XB', 'Xq', 'XC', 'XW', 'Xo', 'XA', 'XE', 'XR', 'XN', 'Xd', 'Xh', 'Xs', 'XF', 'XM', 'X3', 'XY', 'XG', 'Xk', 'X9', 'XV', 'XK', 'XT', 'X1', 'Xb', 'XX', 'Xj', 'XI', 'Xw', 'Xz', 'XH', 'Xc', 'Xa', 'Xf', 'Xx', 'XO', 'Xp', 'Xi', 'Xu', 'Xy', 'X4', 'X7', 'Xg', 'X2', 'XJ', 'XZ', 'XQ', 'XU', 'X5', 'Xt', 'X0', 'XP', 'X8', 'XD', 'Xm', 'Xr', 'XS', 'Xn', 'XL', 'Xv', 'Xl']
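Since the state is now held in plain lists and strings rather than a live generator object, a minimal sketch of the save/restore round trip (assuming the UrlGenerator class is importable wherever it is unpickled):

import pickle

# The instance contains only lists and strings, so pickle can serialize it.
blob = pickle.dumps(url_generator)
restored = pickle.loads(blob)

# Both copies resume from the same state.
assert restored.next_url() == url_generator.next_url()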
Kate (Mar 25, 2024): What about import string and url_chars = string.digits + string.ascii_lowercase + string.ascii_uppercase?

Booboo (Mar 25, 2024): @Kate That's a fine suggestion that I have incorporated in the serializable version.
organize imports
from flask import ...
import hashlib
PEP-8 asks that you organize your imports in three sections. As written, you are suggesting to me that a pip install happened and you are importing from a PyPI project, rather than using batteries included.

Life is too short for you to sweat such details. Let a utility worry about it. isort your imports, so you don't have to.
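For reference, a sketch of the grouping isort would produce for this module (only two of PEP-8's three sections apply here, since there are no local imports):

# Standard library
import hashlib
import random
import string

# Third-party (pip-installed)
from flask import Flask, jsonify, request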
BTW, on the topic of such minutiae, thank you for the nice __main__ guard.
explicit contract
def generate_short_url(url):
    hash_object = hashlib.md5(url.encode())
The function name is brilliant, very informative. The identifier for the parameter is similarly brilliant, couldn't be improved. But alas! It is ambiguous. One might plausibly view a "url" here as the namedtuple that urlparse returns. This would be a stronger signature with type annotation:
def generate_short_url(url: str) -> str:
It would really benefit from the addition of a doctest docstring which shows how "https://example.com" is transformed. A difficulty that runs through this codebase is whether "url" denotes a relative or a fully-qualified URL containing scheme and so on. Spelling out an example would help to clarify the contract.
It is well worth stressing in the docstring that we generate a relative url, which omits e.g. scheme and hostname.
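For example, a sketch of such a contract; the doctests below assert properties of the output (length, determinism) rather than a specific digest value:

import hashlib

def generate_short_url(url: str) -> str:
    """Return a 6-character relative identifier derived from a fully-qualified url.

    The result omits scheme and hostname; the caller prepends those.

    >>> len(generate_short_url("https://example.com"))
    6
    >>> generate_short_url("https://example.com") == generate_short_url("https://example.com")
    True
    """
    return hashlib.md5(url.encode()).hexdigest()[:6]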
Kerckhoffs's principle
Consider prepending a secret PEPPER constant to the input url before hashing it, e.g. "x42" or some GUID. Then an adversary ignorant of the pepper can't make offline attempts to produce a colliding URL pair.
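A minimal sketch of the idea; the PEPPER value here is only a placeholder, and in practice you would load the real secret from configuration rather than source control:

import hashlib

PEPPER = "x42"  # placeholder; load the real secret from the environment

def generate_short_url(url: str) -> str:
    # An adversary who doesn't know the pepper can't precompute
    # a colliding pair of URLs offline.
    return hashlib.md5((PEPPER + url).encode()).hexdigest()[:6]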
A very nice aspect of shortening with a hash rather than with a counter is that if Alice and Bob are interacting with the generator, each occasionally shortening a new URL, their activity is opaque to one another. They cannot easily learn the activity rate of the other participant, without resorting to an out-of-band method. Sometimes concealing such details has business value.
weak hash
We opted for an output space of 2 ** 24 distinct hashes (six hex characters at 4 bits each), so by the birthday bound we're likely to encounter a collision after ballpark four thousand inputs. This seems a small enough number that it's worth briefly discussing the rationale and design tradeoffs in the docstring.
When faced with "budget is six ASCII characters", many designers would opt for base64 encoding, as it lets us stuff 6 bits per character, rather than hexadecimal's 4 bits. Notice that 36 bits modulo 8 is nonzero, leading to == padding in the default encoding. But you don't have to recover the (fixed) length from the encoding, so you can just send in 40 or 48 bits and preserve the first six encoded characters. Call this "pseudo base64" if you like. Recall that we will want the url-safe variant.
MD5 has known compromises / collisions. Prefer SHA3 for new designs.
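A sketch combining the two suggestions: feed 48 bits of a SHA3 digest through the url-safe encoder and keep the first six characters.

import base64
import hashlib

def generate_short_url(url: str) -> str:
    # Six base64 characters carry 36 bits, versus 24 bits for six hex digits.
    # Encoding 6 bytes (48 bits) yields 8 characters with no '=' padding;
    # we keep only the first six.
    digest = hashlib.sha3_256(url.encode()).digest()
    return base64.urlsafe_b64encode(digest[:6]).decode()[:6]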
persistence
url_mapping = {}
Consider using the json module to dump / load this data structure. Or sqlite, or even the Sleepycat key-value store.

Consider logging each input url to a text file, so a subsequent reporting script could detect colliding urls. As it stands we'd have to examine the global flask or nginx log, with lots of unrelated requests mixed in, and POST parameters perhaps not visible.
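A minimal sketch of the json approach (the filename is an arbitrary example):

import json

MAPPING_FILE = "url_mapping.json"  # arbitrary example path

def save_mapping() -> None:
    with open(MAPPING_FILE, "w") as f:
        json.dump(url_mapping, f)

def load_mapping() -> dict:
    # Start empty on first run, when the file doesn't exist yet.
    try:
        with open(MAPPING_FILE) as f:
            return json.load(f)
    except FileNotFoundError:
        return {}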
infinite loop
def generate_unique_short_url(url):
    while True:
        short_url = generate_short_url(url)
        if is_short_url_unique(short_url):
            return short_url
This would only make sense if we were repeatedly drawing fresh candidates, e.g. from generate_random_string.

The loop intends to enforce a loop variant: "short_url shall w.h.p. be different on each iteration." But as written, it is guaranteed to be identical on each iteration, due to using a deterministic hash function. The caller, shorten_url(), doesn't check for a duplicate submission and then suppress re-generating for the same url.
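One way to make the retry loop meaningful, sketched with the question's own helpers: hash first, then fall back to random candidates on collision.

def generate_unique_short_url(url):
    short_url = generate_short_url(url)
    while not is_short_url_unique(short_url):
        # The hash is deterministic, so on a collision draw a fresh
        # random candidate instead of re-hashing the same input.
        short_url = generate_random_string(6)
    return short_url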
loss of protocol
return jsonify({
    "original_url": original_url,
    "shortened_url": f"https://{request.host}/{short_url}"
})
This is a bit terrible. Please assert original_url.startswith("https://"), if that is your intent. As written, "http://example.com" and "ftp://example.com" will both be shortened to the same thing, and it won't be what the end user intended.
In a similar vein, using urlparse() output would let you normalize some of the input, so we don't get different results from these:
HTTP://example.COM
hTTp://eXample.com
Granted, in an IDNA world this can be a complex topic. At some point the end user perhaps deserves what they get.
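A sketch of that normalization; the scheme and host are case-insensitive, so lowercasing them is safe, while the (case-sensitive) path is left alone:

from urllib.parse import urlparse, urlunparse

def normalize_url(url: str) -> str:
    # Lowercase the case-insensitive parts so trivially different
    # spellings of the same URL produce the same hash.
    parts = urlparse(url)
    return urlunparse(parts._replace(scheme=parts.scheme.lower(),
                                     netloc=parts.netloc.lower()))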
dict .get()

if short_url in url_mapping:
    ... = url_mapping[short_url]

This is less convenient than unconditionally assigning url_mapping.get(short_url), and then testing whether the result is None.
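Applied to the redirect handler, the pattern looks like this:

original_url = url_mapping.get(short_url)
if original_url is None:
    return jsonify({"error": "Shortened URL not found"}), 404
return jsonify({"original_url": original_url}), 302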
design of your URL namespace
@app.route('/shorten', methods=['POST'])
...
@app.route('/<short_url>')
I know we're trying to shorten URLs, but consider making that last one slightly longer:
@app.route('/s/<short_url>')
This carves out distinct namespace, which you'll be thankful for a few months down the road.
It happens that /shorten uses a 7-character identifier which won't collide with 6-character short URLs. But what if we later want a few more bits of collision resistance? Things change.
Another way to carve out namespace is to allocate a new vhost domain name, dedicated exclusively to serving redirects.
status code
I feel the 400 and 404 magic numbers are obvious and fine as-is.
return jsonify({"original_url": original_url}), 302
If you're going with "old style", this seems more like a "301 Moved Permanently". For newly authored code, prefer "308 Permanent Redirect".
I will note in passing that werkzeug's HTTP_STATUS_CODES offers these in symbolic form.
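Putting these suggestions together, a sketch using Flask's redirect helper, so the client receives an actual HTTP redirect rather than a JSON body carrying a redirect status:

from flask import redirect

@app.route('/<short_url>')
def redirect_to_original_url(short_url):
    original_url = url_mapping.get(short_url)
    if original_url is None:
        return jsonify({"error": "Shortened URL not found"}), 404
    # 308 Permanent Redirect, preferred over 301/302 for new code.
    return redirect(original_url, code=308)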
This code achieves its design goals (modulo hanging, and losing its map upon restart).
I would be willing to delegate or accept maintenance tasks on it.
Bee H. (Mar 25, 2024): Your critique "loss of protocol" is incorrect/misguided. The request.host is @Booboo's host name. So if I pass in the URL https://google.com/search?q=the-answer-to-life-the-universe-everything, the returned shortened_url would look something like https://booboo.com/a3bf1c

J_H (Mar 25, 2024): @BeeH. yeah, I think I agree with you that Author failed to convey a technical idea, or that in my (frequent!) foolishness and fallibility I failed to grasp the idea. This comes back to the frequent interchange / ambiguity in this codebase between full and relative URLs when we see "url" mentioned. Given the observed diversity of interpretations so far imposed, it suggests that being more rigorous and explicit about use of the term would, in future months, be a boon to maintenance engineers. I have grown accustomed in various codebases to seeing BASE_URL + foo, which dispels ambiguity.
At this time, there is no data persistence. If you used a database, then each record would have its own auto-incremented ID, and you could use the ID as an element of "uniqueness" to derive the hash. Then the hash should logically be recorded in the table with a UNIQUE constraint.
Also, it is not uncommon for URL shortener services to let users choose their own identifier, as long as it is not already in use. This could be an optional parameter.
Some users like to have vanity URLs to make them more memorable or better stand out. Eg: https://shorturl.at/sayaman
Then they may want to have an identifier slightly longer (or shorter) than 6 characters. So I would allow a variable length for the "hash".
The code is short and to the point, easy to read and follow. The thing that is less convincing is the loop in generate_unique_short_url.

For better randomness, consider using secrets.choice(). But your identifiers are too short to avoid a reasonable risk of collision. In practice it would take a lot of iterations for this to happen.
Seems to me that you could really use something more rudimentary, like encoding the record ID using a base64 scheme or similar, plus optional padding if desired when the resulting string is very short.

By the way, you are currently using a range of 62 characters, which looks almost like base64. This is actually base62.
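A sketch of encoding a record ID in base62, with optional zero-padding for very small IDs (the function name and min_length parameter are illustrative):

import string

BASE62 = string.digits + string.ascii_lowercase + string.ascii_uppercase

def base62_encode(record_id: int, min_length: int = 1) -> str:
    # Repeatedly divide by 62, collecting remainders as digits.
    chars = []
    n = record_id
    while n:
        n, rem = divmod(n, 62)
        chars.append(BASE62[rem])
    encoded = ''.join(reversed(chars)) or BASE62[0]
    # Left-pad with '0' so short results reach min_length characters.
    return encoded.rjust(min_length, BASE62[0])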
Booboo (Mar 26, 2024): I would be using what you call "base62" but the OP's code is producing a hex digest, which uses 16 possible characters: 0...9a...f