For learning purposes, I am trying to implement Python's built-in function split
. I am new to Python, and I know that str.split
is written in C. Here's the code:
import math
def split_helper(data: str, sep=','):
if data.find(sep) >= 0:
left=data[0:data.find(sep)]
right=data[data.find(sep)+len(sep):len(data)]
return (left, right)
def split_with_space(data: str, maxsplit=math.inf):
data = data.lstrip()
output = []
temp_str = ""
split_count = 0
for i in range(0,len(data)):
if(data[i] == ' ') and split_count < maxsplit:
if temp_str!="":
temp_str = temp_str.lstrip()
output.append(temp_str)
temp_str = ""
split_count+=1
else:
temp_str+=data[i]
temp_str = temp_str.lstrip()
output.append(temp_str)
temp_str = ""
return output
def split(data: str, sep=' ', maxsplit=math.inf):
data = data.lstrip()
if data == '':
return []
if sep == ' ':
return split_with_space(data, maxsplit)
output = []
split_count = 0
data_tmp = data
while data_tmp.find(sep) >= 0 and split_count < maxsplit:
left, right = split_helper(data_tmp, sep)
output.append(left)
data_tmp = right
split_count+=1
output.append(data_tmp)
return output
if __name__ == '__main__':
# Comparing split functionality to the built-in one
test_string = ''
assert split(test_string) == test_string.split()
test_string = ' '
assert split(test_string) == test_string.split()
test_string=',123,'
assert split(test_string, sep=',') == test_string.split(sep=',')
test_string='test'
assert split(test_string) == test_string.split()
test_string='Python 2 3'
assert split(test_string, maxsplit=1) == test_string.split(maxsplit=1)
test_string=' test 6 7'
assert split(test_string, maxsplit=1) == test_string.split(maxsplit=1)
test_string=' Hi 8 9'
assert split(test_string, maxsplit=0) == test_string.split(maxsplit=0)
test_string=' set 3 4'
assert split(test_string) == test_string.split()
test_string='set;:23'
assert split(test_string, sep=';:', maxsplit=0) == test_string.split(sep=';:', maxsplit=0)
test_string='set;:;:23'
assert split(test_string, sep=';:', maxsplit=2) == test_string.split(sep=';:', maxsplit=2)
Please check the implementation of the split
function and its tests. If there is any suggestion, please let me know.
3 Answers 3
Your function doesn't correctly mimic
str.split
:>>> test = lambda s, *args, **kwargs: (split(s, *args, **kwargs), s.split(*args, **kwargs)) >>> test(" hello world ") (['hello', 'world', ''], ['hello', 'world']) >>> test(" hello world ", " ") (['hello', 'world', ''], ['', 'hello', '', 'world', '']) >>> test("foo\nbar") (['foo\nbar'], ['foo', 'bar'])
I think building a list of indexes to split is simpler to work with. Assuming we have a function to build the indexes:
We can limit the indexes by
maxsplit
.
Usingitertools.islice
if the object we're interacting with is anIterable
rather than aSequence
.We can add the start and end of the data to the indexes to include all the data.
We can iterate through the indexes slicing the data from the end and start index.
def split_(data: str, sep: str | None = None, maxsplit: int = -1) -> list[str]: indexes = [(3, 4)] # replace with code to generate indexes indexes = itertools.islice(indexes, None if -1 == maxsplit else maxsplit) indexes = itertools.chain([(0, 0)], indexes, [(len(data), len(data))]) return [ data[prev : curr] for (_, prev), (curr, _) in itertools.pairwise(indexes) ]
The function to generate the indexes is basically what you already wrote:
def indexes__find(data: str, sep: str) -> Iterator[tuple[int, int]]: size = len(sep) i = 0 while 0 <= (i := data.find(sep, i)): yield (i, i + size) i += size
def indexes__find(data: str, sep: str) -> Iterator[tuple[int, int]]:
size = len(sep)
i = 0
while 0 <= (i := data.find(sep, i)):
yield (i, i + size)
i += size
def split(data: str, sep: str | None = None, maxsplit: int = -1) -> list[str]:
indexes = indexes__find(data, sep) # NOTE: doesn't work with sep=None
indexes = itertools.islice(indexes, None if -1 == maxsplit else maxsplit)
indexes = itertools.chain([(0, 0)], indexes, [(len(data), len(data))])
return [
data[prev : curr]
for (_, prev), (curr, _) in itertools.pairwise(indexes)
]
To get sep=None
working is a little more tricky. But the algorithm is still fairly simple:
For each character in the data call a function to determine if the char is a split.
def indexes__match(data: str, match: Callable[[str], bool]) -> Iterator[tuple[int, int]]: return ( (i, i + 1) for i in range(len(data)) if match(data[i]) )
We then want to merge consecutive runs of splits together.
def merge_indexes(indexes: Iterator[tuple[int, int]]) -> Iterator[tuple[int, int]]: if (prev := next(indexes, None)) is None: return for curr in indexes: if prev[-1] == curr[0]: prev = prev[0], curr[1] else: yield prev prev = curr yield prev
The mutations to the indexes are a little more involved. Basically we need to add the start index, limit the splits and then add the end index with a merge.
indexes = indexes__match(data, str.isspace) indexes = itertools.chain([(0, 0)], indexes) indexes = merge_indexes(indexes) indexes = itertools.islice(indexes, None if -1 == maxsplit else maxsplit + 1) indexes = itertools.chain(indexes, [(len(data), len(data))]) indexes = merge_indexes(indexes)
from collections.abc import Callable, Iterator
import itertools
def indexes__find(data: str, sep: str) -> Iterator[tuple[int, int]]:
size = len(sep)
i = 0
while 0 <= (i := data.find(sep, i)):
yield (i, i + size)
i += size
def indexes__match(data: str, match: Callable[[str], bool]) -> Iterator[tuple[int, int]]:
return (
(i, i + 1)
for i in range(len(data))
if match(data[i])
)
def merge_indexes(indexes: Iterator[tuple[int, int]]) -> Iterator[tuple[int, int]]:
if (prev := next(indexes, None)) is None:
return
for curr in indexes:
if prev[-1] == curr[0]:
prev = prev[0], curr[1]
else:
yield prev
prev = curr
yield prev
def split(data: str, sep: str | None = None, maxsplit: int = -1) -> list[str]:
if sep is None:
indexes = indexes__match(data, str.isspace)
indexes = itertools.chain([(0, 0)], indexes)
indexes = merge_indexes(indexes)
indexes = itertools.islice(indexes, None if -1 == maxsplit else maxsplit + 1)
indexes = itertools.chain(indexes, [(len(data), len(data))])
indexes = merge_indexes(indexes)
else:
indexes = indexes__find(data, sep)
indexes = itertools.islice(indexes, None if -1 == maxsplit else maxsplit)
indexes = itertools.chain([(0, 0)], indexes, [(len(data), len(data))])
return [
data[prev : curr]
for (_, prev), (curr, _) in itertools.pairwise(indexes)
]
We can see the split out functions are really quite simple, and all the implementation details are stored in split
. split
isn't the easiest to read code, but each line is fairly straight forward.
-
\$\begingroup\$ Sorry, I am wondering where
Iterator
come from. I tried to run the code and gotNameError: name 'Iterator' is not defined
... \$\endgroup\$JimmyHu– JimmyHu2024年04月10日 13:48:17 +00:00Commented Apr 10, 2024 at 13:48 -
\$\begingroup\$ @JimmyHu
from typing import Iterator
(orfrom collections.abc import Iterator
for python 3.9 and above). Would be nice to edit that into answer. @Peliokrays Could you elaborate onindexes__find
naming pattern - is that a typo or deliberate decision? It's quite unusual identifier name for Python, usually there is one underscore separating words. \$\endgroup\$STerliakov– STerliakov2024年04月10日 18:28:59 +00:00Commented Apr 10, 2024 at 18:28 -
1\$\begingroup\$ @SUTerliakov
foo__bar
is taken from CSS. You'll noticeindexes__match
too. Both are different algorithms forindexes
, making a static class/module isn't worth wrapping everything up in aIndexes.match
andIndexes.find
. \$\endgroup\$2024年04月10日 19:36:58 +00:00Commented Apr 10, 2024 at 19:36
Thank you for showing how to run the code with all the tests at the end.
The code layout is good, and you did a good job partitioning code into functions.
Here are some minor suggestions, mostly for coding style.
Documentation
It would be helpful to add docstrings for the functions describing their purpose and inputs and outputs.
Readability
The code would be easier to read with space around the operators:
=
!=
+=
:
The black program can be used to do this automatically for you.
Simpler
This line:
for i in range(0,len(data)):
is more simply written as:
for i in range(len(data)):
Consistency
There is inconsistent usage of parentheses here:
if(data[i] == ' ') and split_count < maxsplit:
Either use parens for both expressions or for neither, for example:
if data[i] == ' ' and split_count < maxsplit:
-
3\$\begingroup\$ Simpler is not simpler enough.
i
is only used to accessdata[i]
. Much simpler isfor d in data
. Always think twice before writingfor i in range
. \$\endgroup\$vnp– vnp2024年04月09日 23:19:30 +00:00Commented Apr 9, 2024 at 23:19 -
2\$\begingroup\$ @vnp: Ha! You're right... whenever I see
range(0,
, I think simpler, then apparently I stop thinking :) \$\endgroup\$toolic– toolic2024年04月09日 23:23:07 +00:00Commented Apr 9, 2024 at 23:23
Store the result of inefficient functions
You call find
three times in split_helper
and once more in split
to decide whether to call split_helper
. If your data string is very long, this could be very inefficient. This is a perfect place to deploy a walrus operator:
def split_helper(data: str, sep=','):
if (sepindex := data.find(sep)) >= 0:
left=data[0: sepindex]
right=data[sepindex + len(sep): len(data)]
return (left, right)
split_helper
will raise an exception if data
does not contain sep
This could be a perfectly valid design choice, but as it is currently written, if data does not contain a sep
character, the function will raise a NameError
, which is not a sensible result here. An IndexError
would be much more standard:
def split_helper(data: str, sep=','):
if (sepindex := data.find(sep)) >= 0:
left=data[0: sepindex]
right=data[sepindex + len(sep): len(data)]
return (left, right)
raise IndexError(f"String '{data}' does not contain separator '{sep}')
edit: I originally had this as a ValueError
, which I think is also valid (data
is a bad value), but IndexError
is probably the best match (sep
is not in data
). The important thing is that NameError
is always the result of a bug, not something that a user should ever have to check for.
That's a totally valid implementation, but will be a little cumbersome to use. Within split
, you'd want something like
while split_count < maxsplit:
try:
left, right = split_helper(data_tmp, sep)
except ValueError:
break
output.append(left)
data_tmp = right
split_count += 1
You could instead make split_helper
never raise an exception, but return a special value when data
does not contain sep
, which would make split
a little more elegant:
def split_helper(data: str, sep=','):
if (sepindex := data.find(sep)) >= 0:
left=data[0: sepindex]
right=data[sepindex + len(sep): len(data)]
return (left, right)
return (data, None)
def split(...):
...
while data_tmp and split_count < maxsplit:
left, data_tmp = split(helper(data_tmp, sep)
output.append(left)
split_count += 1
Explore related questions
See similar questions with these tags.