Split String Function Implementation in Python

Question 1

For learning purposes, I am trying to implement Python's built-in function split. I am new to Python, and I know that str.split is written in C. Here's the code:

import math
def split_helper(data: str, sep=','):
 if data.find(sep) >= 0:
 left=data[0:data.find(sep)]
 right=data[data.find(sep)+len(sep):len(data)]
 return (left, right)
def split_with_space(data: str, maxsplit=math.inf):
 data = data.lstrip()
 output = []
 temp_str = ""
 split_count = 0
 for i in range(0,len(data)):
 if(data[i] == ' ') and split_count < maxsplit:
 if temp_str!="":
 temp_str = temp_str.lstrip()
 output.append(temp_str)
 temp_str = ""
 split_count+=1
 else:
 temp_str+=data[i]
 temp_str = temp_str.lstrip()
 output.append(temp_str)
 temp_str = ""
 return output
def split(data: str, sep=' ', maxsplit=math.inf):
 data = data.lstrip()
 if data == '':
 return []
 if sep == ' ':
 return split_with_space(data, maxsplit)
 output = []
 split_count = 0
 data_tmp = data
 while data_tmp.find(sep) >= 0 and split_count < maxsplit:
 left, right = split_helper(data_tmp, sep)
 output.append(left)
 data_tmp = right
 split_count+=1
 output.append(data_tmp)
 return output
if __name__ == '__main__':
 # Comparing split functionality to the built-in one
 test_string = ''
 assert split(test_string) == test_string.split()
 test_string = ' '
 assert split(test_string) == test_string.split()
 test_string=',123,'
 assert split(test_string, sep=',') == test_string.split(sep=',')
 test_string='test'
 assert split(test_string) == test_string.split()
 test_string='Python 2 3'
 assert split(test_string, maxsplit=1) == test_string.split(maxsplit=1)
 test_string=' test 6 7'
 assert split(test_string, maxsplit=1) == test_string.split(maxsplit=1)
 test_string=' Hi 8 9'
 assert split(test_string, maxsplit=0) == test_string.split(maxsplit=0)
 test_string=' set 3 4'
 assert split(test_string) == test_string.split()
 test_string='set;:23'
 assert split(test_string, sep=';:', maxsplit=0) == test_string.split(sep=';:', maxsplit=0)
 test_string='set;:;:23'
 assert split(test_string, sep=';:', maxsplit=2) == test_string.split(sep=';:', maxsplit=2)

Python Online

Please check the implementation of the split function and its tests. If there is any suggestion, please let me know.

Question 2

Your function doesn't correctly mimic str.split:

>>> test = lambda s, *args, **kwargs: (split(s, *args, **kwargs), s.split(*args, **kwargs))
>>> test(" hello world ")
(['hello', 'world', ''], ['hello', 'world'])
>>> test(" hello world ", " ")
(['hello', 'world', ''], ['', 'hello', '', 'world', ''])
>>> test("foo\nbar")
(['foo\nbar'], ['foo', 'bar'])

I think building a list of indexes to split is simpler to work with. Assuming we have a function to build the indexes:
1. We can limit the indexes by maxsplit.
  Using itertools.islice if the object we're interacting with is an Iterable rather than a Sequence.
2. We can add the start and end of the data to the indexes to include all the data.
3. We can iterate through the indexes slicing the data from the end and start index.
```
def split_(data: str, sep: str | None = None, maxsplit: int = -1) -> list[str]:
 indexes = [(3, 4)] # replace with code to generate indexes
 indexes = itertools.islice(indexes, None if -1 == maxsplit else maxsplit)
 indexes = itertools.chain([(0, 0)], indexes, [(len(data), len(data))])
 return [
 data[prev : curr]
 for (_, prev), (curr, _) in itertools.pairwise(indexes)
 ]
```

The function to generate the indexes is basically what you already wrote:

def indexes__find(data: str, sep: str) -> Iterator[tuple[int, int]]:
 size = len(sep)
 i = 0
 while 0 <= (i := data.find(sep, i)):
 yield (i, i + size)
 i += size

def indexes__find(data: str, sep: str) -> Iterator[tuple[int, int]]:
 size = len(sep)
 i = 0
 while 0 <= (i := data.find(sep, i)):
 yield (i, i + size)
 i += size
def split(data: str, sep: str | None = None, maxsplit: int = -1) -> list[str]:
 indexes = indexes__find(data, sep) # NOTE: doesn't work with sep=None
 indexes = itertools.islice(indexes, None if -1 == maxsplit else maxsplit)
 indexes = itertools.chain([(0, 0)], indexes, [(len(data), len(data))])
 return [
 data[prev : curr]
 for (_, prev), (curr, _) in itertools.pairwise(indexes)
 ]

To get sep=None working is a little more tricky. But the algorithm is still fairly simple:

For each character in the data call a function to determine if the char is a split.

def indexes__match(data: str, match: Callable[[str], bool]) -> Iterator[tuple[int, int]]:
 return (
 (i, i + 1)
 for i in range(len(data))
 if match(data[i])
 )

We then want to merge consecutive runs of splits together.

def merge_indexes(indexes: Iterator[tuple[int, int]]) -> Iterator[tuple[int, int]]:
 if (prev := next(indexes, None)) is None:
 return
 for curr in indexes:
 if prev[-1] == curr[0]:
 prev = prev[0], curr[1]
 else:
 yield prev
 prev = curr
 yield prev

The mutations to the indexes are a little more involved. Basically we need to add the start index, limit the splits and then add the end index with a merge.

indexes = indexes__match(data, str.isspace)
indexes = itertools.chain([(0, 0)], indexes)
indexes = merge_indexes(indexes)
indexes = itertools.islice(indexes, None if -1 == maxsplit else maxsplit + 1)
indexes = itertools.chain(indexes, [(len(data), len(data))])
indexes = merge_indexes(indexes)

from collections.abc import Callable, Iterator
import itertools
def indexes__find(data: str, sep: str) -> Iterator[tuple[int, int]]:
 size = len(sep)
 i = 0
 while 0 <= (i := data.find(sep, i)):
 yield (i, i + size)
 i += size
def indexes__match(data: str, match: Callable[[str], bool]) -> Iterator[tuple[int, int]]:
 return (
 (i, i + 1)
 for i in range(len(data))
 if match(data[i])
 )
def merge_indexes(indexes: Iterator[tuple[int, int]]) -> Iterator[tuple[int, int]]:
 if (prev := next(indexes, None)) is None:
 return
 for curr in indexes:
 if prev[-1] == curr[0]:
 prev = prev[0], curr[1]
 else:
 yield prev
 prev = curr
 yield prev
def split(data: str, sep: str | None = None, maxsplit: int = -1) -> list[str]:
 if sep is None:
 indexes = indexes__match(data, str.isspace)
 indexes = itertools.chain([(0, 0)], indexes)
 indexes = merge_indexes(indexes)
 indexes = itertools.islice(indexes, None if -1 == maxsplit else maxsplit + 1)
 indexes = itertools.chain(indexes, [(len(data), len(data))])
 indexes = merge_indexes(indexes)
 else:
 indexes = indexes__find(data, sep)
 indexes = itertools.islice(indexes, None if -1 == maxsplit else maxsplit)
 indexes = itertools.chain([(0, 0)], indexes, [(len(data), len(data))])
 return [
 data[prev : curr]
 for (_, prev), (curr, _) in itertools.pairwise(indexes)
 ]

We can see the split out functions are really quite simple, and all the implementation details are stored in split. split isn't the easiest to read code, but each line is fairly straight forward.

Question 3

Sorry, I am wondering where Iterator come from. I tried to run the code and got NameError: name 'Iterator' is not defined...

Question 4

@JimmyHu from typing import Iterator (or from collections.abc import Iterator for python 3.9 and above). Would be nice to edit that into answer. @Peliokrays Could you elaborate on indexes__find naming pattern - is that a typo or deliberate decision? It's quite unusual identifier name for Python, usually there is one underscore separating words.

Question 5

@SUTerliakov foo__bar is taken from CSS. You'll notice indexes__match too. Both are different algorithms for indexes, making a static class/module isn't worth wrapping everything up in a Indexes.match and Indexes.find.

Question 6

Thank you for showing how to run the code with all the tests at the end.

The code layout is good, and you did a good job partitioning code into functions.

Here are some minor suggestions, mostly for coding style.

Documentation

It would be helpful to add docstrings for the functions describing their purpose and inputs and outputs.

Readability

The code would be easier to read with space around the operators:

=
!=
+=
:

The black program can be used to do this automatically for you.

Simpler

This line:

for i in range(0,len(data)):

is more simply written as:

for i in range(len(data)):

Consistency

There is inconsistent usage of parentheses here:

 if(data[i] == ' ') and split_count < maxsplit:

Either use parens for both expressions or for neither, for example:

 if data[i] == ' ' and split_count < maxsplit:

Question 7

Simpler is not simpler enough. i is only used to access data[i]. Much simpler is for d in data. Always think twice before writing for i in range.

Question 8

@vnp: Ha! You're right... whenever I see range(0,, I think simpler, then apparently I stop thinking :)

Question 9

Store the result of inefficient functions

You call find three times in split_helper and once more in split to decide whether to call split_helper. If your data string is very long, this could be very inefficient. This is a perfect place to deploy a walrus operator:

def split_helper(data: str, sep=','):
 if (sepindex := data.find(sep)) >= 0:
 left=data[0: sepindex]
 right=data[sepindex + len(sep): len(data)]
 return (left, right)

`split_helper` will raise an exception if `data` does not contain `sep`

This could be a perfectly valid design choice, but as it is currently written, if data does not contain a sep character, the function will raise a NameError, which is not a sensible result here. An IndexError would be much more standard:

def split_helper(data: str, sep=','):
 if (sepindex := data.find(sep)) >= 0:
 left=data[0: sepindex]
 right=data[sepindex + len(sep): len(data)]
 return (left, right)
 raise IndexError(f"String '{data}' does not contain separator '{sep}')

edit: I originally had this as a ValueError, which I think is also valid (data is a bad value), but IndexError is probably the best match (sep is not in data). The important thing is that NameError is always the result of a bug, not something that a user should ever have to check for.

That's a totally valid implementation, but will be a little cumbersome to use. Within split, you'd want something like

 while split_count < maxsplit:
 try:
 left, right = split_helper(data_tmp, sep)
 except ValueError:
 break
 output.append(left)
 data_tmp = right
 split_count += 1

You could instead make split_helper never raise an exception, but return a special value when data does not contain sep, which would make split a little more elegant:

def split_helper(data: str, sep=','):
 if (sepindex := data.find(sep)) >= 0:
 left=data[0: sepindex]
 right=data[sepindex + len(sep): len(data)]
 return (left, right)
 return (data, None)
def split(...):
 ...
 while data_tmp and split_count < maxsplit:
 left, data_tmp = split(helper(data_tmp, sep)
 output.append(left)
 split_count += 1

Peilonrayz ♦Peilonrayz 44.4k7 gold badges80 silver badges157 bronze badges · Accepted Answer · 2024-04-09 19:27:04Z

Your function doesn't correctly mimic str.split:

>>> test = lambda s, *args, **kwargs: (split(s, *args, **kwargs), s.split(*args, **kwargs))
>>> test(" hello world ")
(['hello', 'world', ''], ['hello', 'world'])
>>> test(" hello world ", " ")
(['hello', 'world', ''], ['', 'hello', '', 'world', ''])
>>> test("foo\nbar")
(['foo\nbar'], ['foo', 'bar'])

I think building a list of indexes to split is simpler to work with. Assuming we have a function to build the indexes:
1. We can limit the indexes by maxsplit.
  Using itertools.islice if the object we're interacting with is an Iterable rather than a Sequence.
2. We can add the start and end of the data to the indexes to include all the data.
3. We can iterate through the indexes slicing the data from the end and start index.
```
def split_(data: str, sep: str | None = None, maxsplit: int = -1) -> list[str]:
 indexes = [(3, 4)] # replace with code to generate indexes
 indexes = itertools.islice(indexes, None if -1 == maxsplit else maxsplit)
 indexes = itertools.chain([(0, 0)], indexes, [(len(data), len(data))])
 return [
 data[prev : curr]
 for (_, prev), (curr, _) in itertools.pairwise(indexes)
 ]
```

The function to generate the indexes is basically what you already wrote:

def indexes__find(data: str, sep: str) -> Iterator[tuple[int, int]]:
 size = len(sep)
 i = 0
 while 0 <= (i := data.find(sep, i)):
 yield (i, i + size)
 i += size

def indexes__find(data: str, sep: str) -> Iterator[tuple[int, int]]:
 size = len(sep)
 i = 0
 while 0 <= (i := data.find(sep, i)):
 yield (i, i + size)
 i += size
def split(data: str, sep: str | None = None, maxsplit: int = -1) -> list[str]:
 indexes = indexes__find(data, sep) # NOTE: doesn't work with sep=None
 indexes = itertools.islice(indexes, None if -1 == maxsplit else maxsplit)
 indexes = itertools.chain([(0, 0)], indexes, [(len(data), len(data))])
 return [
 data[prev : curr]
 for (_, prev), (curr, _) in itertools.pairwise(indexes)
 ]

To get sep=None working is a little more tricky. But the algorithm is still fairly simple:

For each character in the data call a function to determine if the char is a split.

def indexes__match(data: str, match: Callable[[str], bool]) -> Iterator[tuple[int, int]]:
 return (
 (i, i + 1)
 for i in range(len(data))
 if match(data[i])
 )

We then want to merge consecutive runs of splits together.

def merge_indexes(indexes: Iterator[tuple[int, int]]) -> Iterator[tuple[int, int]]:
 if (prev := next(indexes, None)) is None:
 return
 for curr in indexes:
 if prev[-1] == curr[0]:
 prev = prev[0], curr[1]
 else:
 yield prev
 prev = curr
 yield prev

The mutations to the indexes are a little more involved. Basically we need to add the start index, limit the splits and then add the end index with a merge.

indexes = indexes__match(data, str.isspace)
indexes = itertools.chain([(0, 0)], indexes)
indexes = merge_indexes(indexes)
indexes = itertools.islice(indexes, None if -1 == maxsplit else maxsplit + 1)
indexes = itertools.chain(indexes, [(len(data), len(data))])
indexes = merge_indexes(indexes)

from collections.abc import Callable, Iterator
import itertools
def indexes__find(data: str, sep: str) -> Iterator[tuple[int, int]]:
 size = len(sep)
 i = 0
 while 0 <= (i := data.find(sep, i)):
 yield (i, i + size)
 i += size
def indexes__match(data: str, match: Callable[[str], bool]) -> Iterator[tuple[int, int]]:
 return (
 (i, i + 1)
 for i in range(len(data))
 if match(data[i])
 )
def merge_indexes(indexes: Iterator[tuple[int, int]]) -> Iterator[tuple[int, int]]:
 if (prev := next(indexes, None)) is None:
 return
 for curr in indexes:
 if prev[-1] == curr[0]:
 prev = prev[0], curr[1]
 else:
 yield prev
 prev = curr
 yield prev
def split(data: str, sep: str | None = None, maxsplit: int = -1) -> list[str]:
 if sep is None:
 indexes = indexes__match(data, str.isspace)
 indexes = itertools.chain([(0, 0)], indexes)
 indexes = merge_indexes(indexes)
 indexes = itertools.islice(indexes, None if -1 == maxsplit else maxsplit + 1)
 indexes = itertools.chain(indexes, [(len(data), len(data))])
 indexes = merge_indexes(indexes)
 else:
 indexes = indexes__find(data, sep)
 indexes = itertools.islice(indexes, None if -1 == maxsplit else maxsplit)
 indexes = itertools.chain([(0, 0)], indexes, [(len(data), len(data))])
 return [
 data[prev : curr]
 for (_, prev), (curr, _) in itertools.pairwise(indexes)
 ]

We can see the split out functions are really quite simple, and all the implementation details are stored in split. split isn't the easiest to read code, but each line is fairly straight forward.

Sorry, I am wondering where Iterator come from. I tried to run the code and got NameError: name 'Iterator' is not defined...
@JimmyHu from typing import Iterator (or from collections.abc import Iterator for python 3.9 and above). Would be nice to edit that into answer. @Peliokrays Could you elaborate on indexes__find naming pattern - is that a typo or deliberate decision? It's quite unusual identifier name for Python, usually there is one underscore separating words.
@SUTerliakov foo__bar is taken from CSS. You'll notice indexes__match too. Both are different algorithms for indexes, making a static class/module isn't worth wrapping everything up in a Indexes.match and Indexes.find.

Stack Exchange Network

Split String Function Implementation in Python

3 Answers 3

Documentation

Readability

Simpler

Consistency

Store the result of inefficient functions

`split_helper` will raise an exception if `data` does not contain `sep`

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Hot Network Questions

Split String Function Implementation in Python

3 Answers 3

Documentation

Readability

Simpler

Consistency

Store the result of inefficient functions

split_helper will raise an exception if data does not contain sep

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Related

Hot Network Questions

`split_helper` will raise an exception if `data` does not contain `sep`