
Commit 82965d5

Add normalize_chunksize and partition utility functions (#47)
1 parent edf4f06 commit 82965d5

File tree

6 files changed: +181 -30 lines changed

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion

@@ -71,7 +71,7 @@ repos:
         additional_dependencies: [tomli]
         files: ^(graphblas_algorithms|docs)/
   - repo: https://github.com/charliermarsh/ruff-pre-commit
-    rev: v0.0.249
+    rev: v0.0.252
     hooks:
       - id: ruff
   - repo: https://github.com/pre-commit/pre-commit-hooks

graphblas_algorithms/nxapi/_utils.py

Lines changed: 127 additions & 0 deletions

@@ -0,0 +1,127 @@
+from math import ceil
+from numbers import Number
+
+try:
+    from itertools import pairwise  # Added in Python 3.10
+except ImportError:
+
+    def pairwise(it):
+        it = iter(it)
+        for prev in it:
+            for cur in it:
+                yield (prev, cur)
+                prev = cur
+
+
+BYTES_UNITS = {
+    "": 1,
+    "b": 1,
+    "kb": 1000,
+    "mb": 1000**2,
+    "gb": 1000**3,
+    "tb": 1000**4,
+    "pb": 1000**5,
+    "eb": 1000**6,
+    "zb": 1000**7,
+    "kib": 1024,
+    "mib": 1024**2,
+    "gib": 1024**3,
+    "tib": 1024**4,
+    "pib": 1024**5,
+    "eib": 1024**6,
+    "zib": 1024**7,
+}
+
+
+def normalize_chunksize(chunksize, itemsize=1, N=None):
+    if chunksize is None:
+        return None
+    if isinstance(chunksize, Number):
+        rv = int(chunksize)
+        if rv <= 0 or N is not None and rv >= N:
+            return None
+        return rv
+    if not isinstance(chunksize, str):
+        raise TypeError(f"chunksize must be a number or a string; got {type(chunksize)}")
+    chunkstring = chunksize.replace(" ", "").replace("_", "").lower()
+    if not chunkstring or chunkstring == "all":
+        return None
+    for i, c in enumerate(reversed(chunkstring)):
+        if c.isdigit():
+            index = len(chunkstring) - i
+            break
+    else:
+        chunkstring = f"1{chunkstring}"
+        index = 1
+
+    prefix = chunkstring[:index]
+    suffix = chunkstring[index:]
+
+    try:
+        number = float(prefix)
+    except ValueError as exc:
+        raise ValueError(
+            f"Bad chunksize: {chunksize!r}. Could not interpret {prefix!r} as a number."
+        ) from exc
+
+    if suffix in {"chunk", "chunks"}:
+        if number <= 1:
+            return None
+        if N is None:
+            raise TypeError(
+                f"N argument is required to determine chunksize to split into {int(number)} chunks"
+            )
+        rv = ceil(N / number)
+    else:
+        scale = BYTES_UNITS.get(suffix)
+        if scale is None:
+            raise ValueError(
+                f"Bad chunksize: {chunksize!r}. Could not interpret {suffix!r} as a bytes unit."
+            )
+        number *= scale
+        if chunkstring[-1] == "b":
+            number = max(1, number / itemsize)
+        rv = int(round(number))
+    if rv <= 0 or N is not None and rv >= N:
+        return None
+    return rv
+
+
+def partition(chunksize, L, *, evenly=True):
+    """Partition a list into chunks"""
+    N = len(L)
+    if N == 0:
+        return
+    chunksize = int(chunksize)
+    if chunksize <= 0 or chunksize >= N:
+        yield L
+        return
+    if chunksize == 1:
+        yield from L
+        return
+    if evenly:
+        k = ceil(N / chunksize)
+        if k * chunksize != N:
+            yield from split_evenly(k, L)
+            return
+    for start, stop in pairwise(range(0, N + chunksize, chunksize)):
+        yield L[start:stop]
+
+
+def split_evenly(k, L):
+    """Split a list into approximately-equal parts"""
+    N = len(L)
+    if N == 0:
+        return
+    k = int(k)
+    if k <= 1:
+        yield L
+        return
+    start = 0
+    for i in range(1, k):
+        stop = (N * i + k - 1) // k
+        if stop != start:
+            yield L[start:stop]
+            start = stop
+    if stop != N:
+        yield L[stop:]
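
A quick sketch of how the two new utilities behave (illustrative values, not part of the commit; the expected outputs follow from the code above):

from graphblas_algorithms.nxapi._utils import normalize_chunksize, partition

# Byte units are scaled, then divided by `itemsize` to get items per chunk:
# "1 kb" -> 1000 bytes / 8 bytes per item -> 125 items.
print(normalize_chunksize("1 kb", itemsize=8))  # 125

# "... chunks" strings need `N` to derive a per-chunk size:
print(normalize_chunksize("2 chunks", N=20))  # 10

# None means "do not split", e.g. when one chunk would already cover everything:
print(normalize_chunksize("1 GiB", itemsize=8, N=100))  # None

# partition() falls back to split_evenly() when chunks would be ragged:
# 10 items with chunksize 4 -> 3 chunks of sizes 4, 3, 3 (not 4, 4, 2).
print([len(chunk) for chunk in partition(4, list(range(10)))])  # [4, 3, 3]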

graphblas_algorithms/nxapi/cluster.py

Lines changed: 14 additions & 16 deletions

@@ -5,6 +5,8 @@
 from graphblas_algorithms.classes.graph import to_undirected_graph
 from graphblas_algorithms.utils import not_implemented_for
 
+from ._utils import normalize_chunksize, partition
+
 __all__ = [
     "triangles",
     "transitivity",
@@ -90,11 +92,11 @@ def _split(L, k):
 
 
 # TODO: should this move into algorithms?
-def _square_clustering_split(G, node_ids=None, *, nsplits):
+def _square_clustering_split(G, node_ids=None, *, chunksize):
     if node_ids is None:
         node_ids, _ = G._A.reduce_rowwise(monoid.any).to_coo(values=False)
     result = None
-    for chunk_ids in _split(node_ids, nsplits):
+    for chunk_ids in partition(chunksize, node_ids):
         res = algorithms.square_clustering(G, chunk_ids)
         if result is None:
             result = res
@@ -103,36 +105,32 @@ def _square_clustering_split(G, node_ids=None, *, nsplits):
     return result
 
 
-def square_clustering(G, nodes=None, *, nsplits="auto"):
-    # `nsplits` is used to split the computation into chunks.
+def square_clustering(G, nodes=None, *, chunksize="256 MiB"):
+    # `chunksize` is used to split the computation into chunks.
     # square_clustering computes `A @ A`, which can get very large, even dense.
-    # The default `nsplits` is to choose the number so that `Asubset @ A`
+    # The default `chunksize` is to choose the number so that `Asubset @ A`
     # will be about 256 MB if dense.
     G = to_undirected_graph(G)
     if len(G) == 0:
         return {}
-    if nsplits == "auto":
-        # TODO: make a utility function for this that can be reused
-        # Also, should we use `chunksize` instead of `nsplits`?
-        targetsize = 256 * 1024 * 1024  # 256 MB
-        nsplits = len(G) ** 2 * G._A.dtype.np_type.itemsize // targetsize
-        if nsplits <= 1:
-            nsplits = None
+
+    chunksize = normalize_chunksize(chunksize, len(G) * G._A.dtype.np_type.itemsize, len(G))
+
     if nodes is None:
         # Should we use this one for subsets of nodes as well?
-        if nsplits is None:
+        if chunksize is None:
             result = algorithms.square_clustering(G)
         else:
-            result = _square_clustering_split(G, nsplits=nsplits)
+            result = _square_clustering_split(G, chunksize=chunksize)
         return G.vector_to_nodemap(result, fill_value=0)
     if nodes in G:
         idx = G._key_to_id[nodes]
         return algorithms.single_square_clustering(G, idx)
     ids = G.list_to_ids(nodes)
-    if nsplits is None:
+    if chunksize is None:
         result = algorithms.square_clustering(G, ids)
     else:
-        result = _square_clustering_split(G, ids, nsplits=nsplits)
+        result = _square_clustering_split(G, ids, chunksize=chunksize)
     return G.vector_to_nodemap(result)
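
To make the new "256 MiB" default concrete: normalize_chunksize receives the byte cost of one dense result row (len(G) * itemsize) and returns how many rows of `Asubset @ A` fit in the target. A back-of-the-envelope sketch (the graph size and 8-byte values here are illustrative assumptions):

from graphblas_algorithms.nxapi._utils import normalize_chunksize

n_nodes = 1_000_000          # hypothetical graph size
bytes_per_row = n_nodes * 8  # one dense row of `Asubset @ A` at 8 bytes per value

rows = normalize_chunksize("256 MiB", itemsize=bytes_per_row, N=n_nodes)
print(rows)  # 34 -> each A[chunk] @ A block is ~256 MiB if dense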

graphblas_algorithms/nxapi/shortest_paths/weighted.py

Lines changed: 5 additions & 13 deletions

@@ -1,6 +1,7 @@
 from graphblas_algorithms import algorithms
 from graphblas_algorithms.classes.digraph import to_graph
 
+from .._utils import normalize_chunksize, partition
 from ..exception import NetworkXUnbounded, NodeNotFound
 
 __all__ = [
@@ -9,18 +10,14 @@
 ]
 
 
-def all_pairs_bellman_ford_path_length(G, weight="weight", *, chunksize="auto"):
+def all_pairs_bellman_ford_path_length(G, weight="weight", *, chunksize="10 MiB"):
     # Larger chunksize offers more parallelism, but uses more memory.
     # Chunksize indicates for how many source nodes to compute at one time.
     # The default is to choose the number of rows so the result, if dense,
     # will be about 10MB.
     G = to_graph(G, weight=weight)
-    if chunksize == "auto":
-        # TODO: make a utility function for this that can be reused
-        targetsize = 10 * 1024 * 1024  # 10 MB
-        chunksize = max(1, targetsize // (len(G) * G._A.dtype.np_type.itemsize))
-
-    if chunksize is None or chunksize <= 0 or chunksize >= len(G):
+    chunksize = normalize_chunksize(chunksize, len(G) * G._A.dtype.np_type.itemsize, len(G))
+    if chunksize is None:
         # All at once
         try:
             D = algorithms.bellman_ford_path_lengths(G)
@@ -35,12 +32,7 @@ def all_pairs_bellman_ford_path_length(G, weight="weight", *, chunksize="auto"):
                 raise NetworkXUnbounded(*e.args) from e
             yield (source, G.vector_to_nodemap(d))
     else:
-        # We should probably make a utility function for chunking
-        nodes = list(G)
-        for start, stop in zip(
-            range(0, len(nodes), chunksize), range(chunksize, len(nodes) + chunksize, chunksize)
-        ):
-            cur_nodes = nodes[start:stop]
+        for cur_nodes in partition(chunksize, list(G)):
             try:
                 D = algorithms.bellman_ford_path_lengths(G, cur_nodes)
             except algorithms.exceptions.Unbounded as e:
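
Since normalize_chunksize accepts plain numbers as well as byte strings, callers can still chunk by source-node count. A usage sketch (the small weighted graph is made up for illustration):

import networkx as nx

from graphblas_algorithms.nxapi.shortest_paths.weighted import (
    all_pairs_bellman_ford_path_length,
)

G = nx.Graph()
G.add_weighted_edges_from([(0, 1, 2.0), (1, 2, 1.5), (2, 3, 0.5)])

# "10 MiB" is the byte-based default; an int chunks by source count instead.
for source, lengths in all_pairs_bellman_ford_path_length(G, chunksize=2):
    print(source, dict(lengths))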
Lines changed: 33 additions & 0 deletions

@@ -0,0 +1,33 @@
+import pytest
+
+from graphblas_algorithms.nxapi._utils import normalize_chunksize
+
+
+def test_normalize_chunksize():
+    assert normalize_chunksize(None) is None
+    assert normalize_chunksize("all") is None
+    assert normalize_chunksize("") is None
+    assert normalize_chunksize(-1) is None
+    assert normalize_chunksize("-1") is None
+    assert normalize_chunksize(10, N=10) is None
+    assert normalize_chunksize("1 MB", N=100) is None
+    assert normalize_chunksize("1 chunk") is None
+    assert normalize_chunksize("2 chunks", N=20) == 10
+    assert normalize_chunksize(10) == 10
+    assert normalize_chunksize(10.0) == 10
+    assert normalize_chunksize("10") == 10
+    assert normalize_chunksize("10.0") == 10
+    assert normalize_chunksize("1_0 B") == 10
+    assert normalize_chunksize("1e1") == 10
+    assert normalize_chunksize("1e-2 kb") == 10
+    assert normalize_chunksize("Mb") == 1000**2
+    assert normalize_chunksize(" mb") == 1000**2
+    assert normalize_chunksize("gib") == 1024**3
+    with pytest.raises(TypeError, match="chunksize must be"):
+        normalize_chunksize(object())
+    with pytest.raises(ValueError, match="as a bytes"):
+        normalize_chunksize("10 badbytes")
+    with pytest.raises(ValueError, match="as a number"):
+        normalize_chunksize("1bad0 TB")
+    with pytest.raises(TypeError, match="N argument is required"):
+        normalize_chunksize("10 chunks")

pyproject.toml

Lines changed: 1 addition & 0 deletions

@@ -199,6 +199,7 @@ ignore = [
     "PLR0913",  # Too many arguments to function call
     "PLR0915",  # Too many statements
     "PLR2004",  # Magic number used in comparison, consider replacing magic with a constant variable
+    "PLW2901",  # Outer for loop variable ... overwritten by inner assignment target (Note: good advice, but too strict)
     "RET502",  # Do not implicitly `return None` in function able to return non-`None` value
     "RET503",  # Missing explicit `return` at the end of function able to return non-`None` value
     "RET504",  # Unnecessary variable assignment before `return` statement
