Merging sorted arrays in Python

Question 1


def merge_arrays(list1, list2):
 len_list1 = len(list1); len_list2 = len(list2)
 merge_len = len_list1 + len_list2
 merge_list = []
 l1_ptr = 0
 l2_ptr = 0
 # import pdb; pdb.set_trace()
 while(l1_ptr <= len_list1-1 and l2_ptr <= len_list2-1):
 if (list1[l1_ptr] <= list2[l2_ptr]):
 merge_list.append(list1[l1_ptr])
 l1_ptr += 1
 
 elif (list1[l1_ptr] > list2[l2_ptr]):
 merge_list.append(list2[l2_ptr])
 l2_ptr += 1
 
 if l1_ptr > len_list1-1: #list1 exhausted
 for item in list2[l2_ptr:]:
 merge_list.append(item)
 else:
 for item in list1[l1_ptr:]:
 merge_list.append(item)
 return merge_list

I am trying to merge sorted arrays in python. How can I improve this? Honestly it looks like I've written this in C and not in Python

Question 2

There's a builtin function for this from heapq import merge... was this intentionally reinventing the wheel?

Question 3

@Gerrit0 Or just sorted(list1 + list2).

Question 4

@superbrain that doesn't take into account that both lists have been sorted already

Question 5

@MaartenFabré It does. I guess you're not familiar with Timsort?

Question 6

@MaartenFabré In particular, it also only takes linear time. And in some tests I just did, it was about 18 times faster than list(merge(list1, list2)).

Question 7

various

merge_len is unused
the extra parentheses around the simple checks are unnecessary
l1_ptr <= len_list1-1 can be made more clearer as l1_ptr < len_list1
using the variable name l1_ptr to save a few characters while making it harder to guess from the name what it does is not useful

Working with the indices directly is not really pythonic indeed. You can make this more generic, using iter and next, and work for all iterables.

typing

add typing information:

import typing
T = typing.TypeVar("T")
def merge_sorted_iterables(
 iterable1: typing.Iterable[T], iterable2: typing.Iterable[T]
) -> typing.Iterable[T]:

This is extra explanation for the user of this function (and his IDE).

docstring

Add some explanation on what the method does, expects from the caller, and returns.

def merge_sorted_iterables(
 iterable1: typing.Iterable[T], iterable2: typing.Iterable[T]
) -> typing.Iterable[T]:
 """Merge 2 sorted iterables.
 
 The items in the iterables need to be comparable (and support `<=`).
 ...
 """

iterator

Instead of keeping track of the index you can use iter and next. You don't even need to add the items to a list, you can yield them, so the caller of the method can decide in what way he wants to use this.

done = object()
iterator1 = iter(iterable1)
iterator2 = iter(iterable2)
item1 = next(iterator1, done)
item2 = next(iterator2, done)
while item1 is not done and item2 is not done:
 if item1 <= item2:
 yield item1
 item1 = next(iterator1, done)
 else:
 yield item2
 item2 = next(iterator2, done)

Then all that needs to be done is continue the iterator that is not finished

 if item1 is not done:
 yield item1
 yield from iterator1
 if item2 is not done:
 yield item2
 yield from iterator2

import typing
T = typing.TypeVar("T")
def merge_sorted_iterables(
 iterable1: typing.Iterable[T], iterable2: typing.Iterable[T]
) -> typing.Iterable[T]:
 """Merge 2 sorted iterables.
 
 The items in the iterables need to be comparable (and support `<=`).
 ...
 """
 done = object()
 
 iterator1 = iter(iterable1)
 iterator2 = iter(iterable2)
 
 item1 = next(iterator1, done)
 item2 = next(iterator2, done)
 
 while item1 is not done and item2 is not done:
 if item1 <= item2:
 yield item1
 item1 = next(iterator1, done)
 else:
 yield item2
 item2 = next(iterator2, done)
 if item1 is not done:
 yield item1
 yield from iterator1
 if item2 is not done:
 yield item2
 yield from iterator2

testing

You can test the behaviour, starting with the simplest cases:

import pytest
def test_empty():
 expected = []
 result = list(merge_sorted_iterables([], []))
 assert result == expected
def test_single():
 expected = [0, 1, 2]
 result = list(merge_sorted_iterables([], range(3)))
 assert expected == result
 result = list(merge_sorted_iterables(range(3), [],))
 assert expected == result
def test_simple():
 expected = [0, 1, 2, 3, 4, 5]
 result = list(merge_sorted_iterables([0, 1, 2], [3, 4, 5]))
 assert result == expected
 result = list(merge_sorted_iterables([0, 2, 4], [1, 3, 5]))
 assert result == expected
 result = list(merge_sorted_iterables([3, 4, 5], [0, 1, 2],))
 assert result == expected
def test_string():
 expected = list("abcdef")
 result = list(merge_sorted_iterables("abc", "def"))
 assert result == expected
 result = list(merge_sorted_iterables("ace", "bdf"))
 assert result == expected
 result = list(merge_sorted_iterables("def", "abc",))
 assert result == expected
def test_iterable():
 
 expected = [0, 1, 2, 3, 4, 5]
 result = list(merge_sorted_iterables(iter([0, 1, 2]), iter([3, 4, 5])))
 assert result == expected
 result = list(merge_sorted_iterables(iter([0, 2, 4]), iter([1, 3, 5])))
 assert result == expected
 result = list(merge_sorted_iterables(iter([3, 4, 5]), iter([0, 1, 2]),))
 assert result == expected
def test_comparable():
 with pytest.raises(TypeError, match="not supported between instances of"):
 list(merge_sorted_iterables([0, 1, 2], ["a", "b", "c"]))

descending

Once you have these test in place, you can easily expand the behaviour to also take descending iterables:

import operator
def merge_sorted_iterables(
 iterable1: typing.Iterable[T],
 iterable2: typing.Iterable[T],
 *,
 ascending: bool = True,
) -> typing.Iterable[T]:
 """Merge 2 sorted iterables.
 
 The items in the iterables need to be comparable.
 ...
 """
 done = object()
 iterator1 = iter(iterable1)
 iterator2 = iter(iterable2)
 item1 = next(iterator1, done)
 item2 = next(iterator2, done)
 comparison = operator.le if ascending else operator.ge
 while item1 is not done and item2 is not done:
 if comparison(item1, item2):
 yield item1
 item1 = next(iterator1, done)
 else:
 yield item2
 item2 = next(iterator2, done)
 if item1 is not done:
 yield item1
 yield from iterator1
 if item2 is not done:
 yield item2
 yield from iterator2

I added the ascending keyword as a keyword-only argument to avoid confusion and keep backwards compatibility

One of its tests:

def test_descending():
 expected = [5, 4, 3, 2, 1, 0]
 result = list(
 merge_sorted_iterables([2, 1, 0], [5, 4, 3], ascending=False)
 )
 assert result == expected
 result = list(
 merge_sorted_iterables([4, 2, 0], [5, 3, 1], ascending=False)
 )
 assert result == expected
 result = list(
 merge_sorted_iterables([5, 4, 3], [2, 1, 0], ascending=False)
 )
 assert result == expected

Question 8

wow this is great, thank you so much. you have no idea how much more i have learnt from your explanation and just this small example. thank you for taking the time, truly!

Question 9

Thanks for the tests, have been useful :-). Twice you wrote expected == result instead of result == expected, though. For consistency I think it would be better if all were the same.

Question 10

I disagree 100% with the "typing" section. The function is called "merge_sorted_iterables" and takes two arguments "iterable1" and "iterable2". Of course they're iterables, and of course it returns an iterable. You've added zero useful information, and broken the function signature over three lines, and completely counterfeit the beauty of Python's indentation.

Question 11

In this case the typing information is a bit superfluous. But it helps the IDE, and in other, less trivial cases it can have a lot of value

Question 12

Use a 4-space indent.

Don't repeatedly subtract 1 from the same unchanging value.

Simplify the compare conditional: just use else.

Take advantage of list.extend().

Drop the wrap-up conditionals: they aren't actually needed. Code like zs.extend(xs[xi:]) will work fine even if xi exceeds the list bounds.

Shorten the variable names to lighten the code weight and increase readability. There's no loss of meaning here, because all of the short names are quite conventional and make sense in a generic function like this.

def merge_arrays(xs, ys):
 # Setup.
 xmax = len(xs) - 1
 ymax = len(ys) - 1
 xi = 0
 yi = 0
 zs = []
 # Compare and merge.
 while xi <= xmax and yi <= ymax:
 if xs[xi] <= ys[yi]:
 zs.append(xs[xi])
 xi += 1
 else:
 zs.append(ys[yi])
 yi += 1
 # Merge any remainders and return.
 zs.extend(ys[yi:])
 zs.extend(xs[xi:])
 return zs

Last night I wrote an iterator-base solution but somehow forgot that next() supports a handy default argument: the code was awkward and Maarten Fabré did a nicer implementation. But if your willing to use more_itertools.peekable() you can achieve a simple, readable implementation. Thanks to superb-rain for an idea in the comments that helped me simplify further.

from more_itertools import peekable
def merge(xs, ys):
 xit = peekable(xs)
 yit = peekable(ys)
 while xit and yit:
 it = xit if xit.peek() <= yit.peek() else yit
 yield next(it)
 yield from (xit or yit)

Question 13

How about this?

Question 14

@superbrain Cool: good to know a peekable iterator can be checked for exhaustion in a simple if-test. I simplified my version even more.

Maarten Fabré Maarten Fabré 9,3901 gold badge15 silver badges27 bronze badges · Accepted Answer · 2020-08-31 10:07:56Z

various

merge_len is unused
the extra parentheses around the simple checks are unnecessary
l1_ptr <= len_list1-1 can be made more clearer as l1_ptr < len_list1
using the variable name l1_ptr to save a few characters while making it harder to guess from the name what it does is not useful

Working with the indices directly is not really pythonic indeed. You can make this more generic, using iter and next, and work for all iterables.

typing

add typing information:

import typing
T = typing.TypeVar("T")
def merge_sorted_iterables(
 iterable1: typing.Iterable[T], iterable2: typing.Iterable[T]
) -> typing.Iterable[T]:

This is extra explanation for the user of this function (and his IDE).

docstring

Add some explanation on what the method does, expects from the caller, and returns.

def merge_sorted_iterables(
 iterable1: typing.Iterable[T], iterable2: typing.Iterable[T]
) -> typing.Iterable[T]:
 """Merge 2 sorted iterables.
 
 The items in the iterables need to be comparable (and support `<=`).
 ...
 """

iterator

Instead of keeping track of the index you can use iter and next. You don't even need to add the items to a list, you can yield them, so the caller of the method can decide in what way he wants to use this.

done = object()
iterator1 = iter(iterable1)
iterator2 = iter(iterable2)
item1 = next(iterator1, done)
item2 = next(iterator2, done)
while item1 is not done and item2 is not done:
 if item1 <= item2:
 yield item1
 item1 = next(iterator1, done)
 else:
 yield item2
 item2 = next(iterator2, done)

Then all that needs to be done is continue the iterator that is not finished

 if item1 is not done:
 yield item1
 yield from iterator1
 if item2 is not done:
 yield item2
 yield from iterator2

import typing
T = typing.TypeVar("T")
def merge_sorted_iterables(
 iterable1: typing.Iterable[T], iterable2: typing.Iterable[T]
) -> typing.Iterable[T]:
 """Merge 2 sorted iterables.
 
 The items in the iterables need to be comparable (and support `<=`).
 ...
 """
 done = object()
 
 iterator1 = iter(iterable1)
 iterator2 = iter(iterable2)
 
 item1 = next(iterator1, done)
 item2 = next(iterator2, done)
 
 while item1 is not done and item2 is not done:
 if item1 <= item2:
 yield item1
 item1 = next(iterator1, done)
 else:
 yield item2
 item2 = next(iterator2, done)
 if item1 is not done:
 yield item1
 yield from iterator1
 if item2 is not done:
 yield item2
 yield from iterator2

testing

You can test the behaviour, starting with the simplest cases:

import pytest
def test_empty():
 expected = []
 result = list(merge_sorted_iterables([], []))
 assert result == expected
def test_single():
 expected = [0, 1, 2]
 result = list(merge_sorted_iterables([], range(3)))
 assert expected == result
 result = list(merge_sorted_iterables(range(3), [],))
 assert expected == result
def test_simple():
 expected = [0, 1, 2, 3, 4, 5]
 result = list(merge_sorted_iterables([0, 1, 2], [3, 4, 5]))
 assert result == expected
 result = list(merge_sorted_iterables([0, 2, 4], [1, 3, 5]))
 assert result == expected
 result = list(merge_sorted_iterables([3, 4, 5], [0, 1, 2],))
 assert result == expected
def test_string():
 expected = list("abcdef")
 result = list(merge_sorted_iterables("abc", "def"))
 assert result == expected
 result = list(merge_sorted_iterables("ace", "bdf"))
 assert result == expected
 result = list(merge_sorted_iterables("def", "abc",))
 assert result == expected
def test_iterable():
 
 expected = [0, 1, 2, 3, 4, 5]
 result = list(merge_sorted_iterables(iter([0, 1, 2]), iter([3, 4, 5])))
 assert result == expected
 result = list(merge_sorted_iterables(iter([0, 2, 4]), iter([1, 3, 5])))
 assert result == expected
 result = list(merge_sorted_iterables(iter([3, 4, 5]), iter([0, 1, 2]),))
 assert result == expected
def test_comparable():
 with pytest.raises(TypeError, match="not supported between instances of"):
 list(merge_sorted_iterables([0, 1, 2], ["a", "b", "c"]))

descending

Once you have these test in place, you can easily expand the behaviour to also take descending iterables:

import operator
def merge_sorted_iterables(
 iterable1: typing.Iterable[T],
 iterable2: typing.Iterable[T],
 *,
 ascending: bool = True,
) -> typing.Iterable[T]:
 """Merge 2 sorted iterables.
 
 The items in the iterables need to be comparable.
 ...
 """
 done = object()
 iterator1 = iter(iterable1)
 iterator2 = iter(iterable2)
 item1 = next(iterator1, done)
 item2 = next(iterator2, done)
 comparison = operator.le if ascending else operator.ge
 while item1 is not done and item2 is not done:
 if comparison(item1, item2):
 yield item1
 item1 = next(iterator1, done)
 else:
 yield item2
 item2 = next(iterator2, done)
 if item1 is not done:
 yield item1
 yield from iterator1
 if item2 is not done:
 yield item2
 yield from iterator2

I added the ascending keyword as a keyword-only argument to avoid confusion and keep backwards compatibility

One of its tests:

def test_descending():
 expected = [5, 4, 3, 2, 1, 0]
 result = list(
 merge_sorted_iterables([2, 1, 0], [5, 4, 3], ascending=False)
 )
 assert result == expected
 result = list(
 merge_sorted_iterables([4, 2, 0], [5, 3, 1], ascending=False)
 )
 assert result == expected
 result = list(
 merge_sorted_iterables([5, 4, 3], [2, 1, 0], ascending=False)
 )
 assert result == expected

wow this is great, thank you so much. you have no idea how much more i have learnt from your explanation and just this small example. thank you for taking the time, truly!
Thanks for the tests, have been useful :-). Twice you wrote expected == result instead of result == expected, though. For consistency I think it would be better if all were the same.
I disagree 100% with the "typing" section. The function is called "merge_sorted_iterables" and takes two arguments "iterable1" and "iterable2". Of course they're iterables, and of course it returns an iterable. You've added zero useful information, and broken the function signature over three lines, and completely counterfeit the beauty of Python's indentation.
In this case the typing information is a bit superfluous. But it helps the IDE, and in other, less trivial cases it can have a lot of value

Stack Exchange Network

Merging sorted arrays in Python

2 Answers 2

various

typing

docstring

iterator

testing

descending

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Linked

Hot Network Questions

Merging sorted arrays in Python

2 Answers 2

various

typing

docstring

iterator

testing

descending

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Linked

Related

Hot Network Questions