
Commit 82965d5

Add normalize_chunksize and partition utility functions (#47)
1 parent edf4f06 commit 82965d5

File tree

6 files changed: +181 -30 lines changed

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion

@@ -71,7 +71,7 @@ repos:
         additional_dependencies: [tomli]
         files: ^(graphblas_algorithms|docs)/
   - repo: https://github.com/charliermarsh/ruff-pre-commit
-    rev: v0.0.249
+    rev: v0.0.252
     hooks:
       - id: ruff
   - repo: https://github.com/pre-commit/pre-commit-hooks

graphblas_algorithms/nxapi/_utils.py

Lines changed: 127 additions & 0 deletions

@@ -0,0 +1,127 @@
+from math import ceil
+from numbers import Number
+
+try:
+    from itertools import pairwise  # Added in Python 3.10
+except ImportError:
+
+    def pairwise(it):
+        it = iter(it)
+        for prev in it:
+            for cur in it:
+                yield (prev, cur)
+                prev = cur
+
+
+BYTES_UNITS = {
+    "": 1,
+    "b": 1,
+    "kb": 1000,
+    "mb": 1000**2,
+    "gb": 1000**3,
+    "tb": 1000**4,
+    "pb": 1000**5,
+    "eb": 1000**6,
+    "zb": 1000**7,
+    "kib": 1024,
+    "mib": 1024**2,
+    "gib": 1024**3,
+    "tib": 1024**4,
+    "pib": 1024**5,
+    "eib": 1024**6,
+    "zib": 1024**7,
+}
+
+
+def normalize_chunksize(chunksize, itemsize=1, N=None):
+    if chunksize is None:
+        return None
+    if isinstance(chunksize, Number):
+        rv = int(chunksize)
+        if rv <= 0 or N is not None and rv >= N:
+            return None
+        return rv
+    if not isinstance(chunksize, str):
+        raise TypeError(f"chunksize must be a number or a string; got {type(chunksize)}")
+    chunkstring = chunksize.replace(" ", "").replace("_", "").lower()
+    if not chunkstring or chunkstring == "all":
+        return None
+    for i, c in enumerate(reversed(chunkstring)):
+        if c.isdigit():
+            index = len(chunkstring) - i
+            break
+    else:
+        chunkstring = f"1{chunkstring}"
+        index = 1
+
+    prefix = chunkstring[:index]
+    suffix = chunkstring[index:]
+
+    try:
+        number = float(prefix)
+    except ValueError as exc:
+        raise ValueError(
+            f"Bad chunksize: {chunksize!r}. Could not interpret {prefix!r} as a number."
+        ) from exc
+
+    if suffix in {"chunk", "chunks"}:
+        if number <= 1:
+            return None
+        if N is None:
+            raise TypeError(
+                f"N argument is required to determine chunksize to split into {int(number)} chunks"
+            )
+        rv = ceil(N / number)
+    else:
+        scale = BYTES_UNITS.get(suffix)
+        if scale is None:
+            raise ValueError(
+                f"Bad chunksize: {chunksize!r}. Could not interpret {suffix!r} as a bytes unit."
+            )
+        number *= scale
+        if chunkstring[-1] == "b":
+            number = max(1, number / itemsize)
+        rv = int(round(number))
+    if rv <= 0 or N is not None and rv >= N:
+        return None
+    return rv
+
+
+def partition(chunksize, L, *, evenly=True):
+    """Partition a list into chunks"""
+    N = len(L)
+    if N == 0:
+        return
+    chunksize = int(chunksize)
+    if chunksize <= 0 or chunksize >= N:
+        yield L
+        return
+    if chunksize == 1:
+        yield from L
+        return
+    if evenly:
+        k = ceil(N / chunksize)
+        if k * chunksize != N:
+            yield from split_evenly(k, L)
+            return
+    for start, stop in pairwise(range(0, N + chunksize, chunksize)):
+        yield L[start:stop]
+
+
+def split_evenly(k, L):
+    """Split a list into approximately-equal parts"""
+    N = len(L)
+    if N == 0:
+        return
+    k = int(k)
+    if k <= 1:
+        yield L
+        return
+    start = 0
+    for i in range(1, k):
+        stop = (N * i + k - 1) // k
+        if stop != start:
+            yield L[start:stop]
+            start = stop
+    if stop != N:
+        yield L[stop:]
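
A quick sketch of how the two new utilities behave (illustrative values, not part of the commit; the expected outputs follow from the code above):

from graphblas_algorithms.nxapi._utils import normalize_chunksize, partition

# Byte units are scaled, then divided by `itemsize` to get items per chunk:
# "1 kb" -> 1000 bytes / 8 bytes per item -> 125 items.
print(normalize_chunksize("1 kb", itemsize=8))  # 125

# "... chunks" strings need `N` to derive a per-chunk size:
print(normalize_chunksize("2 chunks", N=20))  # 10

# None means "do not split", e.g. when one chunk would already cover everything:
print(normalize_chunksize("1 GiB", itemsize=8, N=100))  # None

# partition() falls back to split_evenly() when chunks would be ragged:
# 10 items with chunksize 4 -> 3 chunks of sizes 4, 3, 3 (not 4, 4, 2).
print([len(chunk) for chunk in partition(4, list(range(10)))])  # [4, 3, 3]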

graphblas_algorithms/nxapi/cluster.py

Lines changed: 14 additions & 16 deletions

@@ -5,6 +5,8 @@
 from graphblas_algorithms.classes.graph import to_undirected_graph
 from graphblas_algorithms.utils import not_implemented_for
 
+from ._utils import normalize_chunksize, partition
+
 __all__ = [
     "triangles",
     "transitivity",
@@ -90,11 +92,11 @@ def _split(L, k):
 
 
 # TODO: should this move into algorithms?
-def _square_clustering_split(G, node_ids=None, *, nsplits):
+def _square_clustering_split(G, node_ids=None, *, chunksize):
     if node_ids is None:
         node_ids, _ = G._A.reduce_rowwise(monoid.any).to_coo(values=False)
     result = None
-    for chunk_ids in _split(node_ids, nsplits):
+    for chunk_ids in partition(chunksize, node_ids):
         res = algorithms.square_clustering(G, chunk_ids)
         if result is None:
             result = res
@@ -103,36 +105,32 @@ def _square_clustering_split(G, node_ids=None, *, nsplits):
     return result
 
 
-def square_clustering(G, nodes=None, *, nsplits="auto"):
-    # `nsplits` is used to split the computation into chunks.
+def square_clustering(G, nodes=None, *, chunksize="256 MiB"):
+    # `chunksize` is used to split the computation into chunks.
     # square_clustering computes `A @ A`, which can get very large, even dense.
-    # The default `nsplits` is to choose the number so that `Asubset @ A`
+    # The default `chunksize` is to choose the number so that `Asubset @ A`
     # will be about 256 MB if dense.
     G = to_undirected_graph(G)
     if len(G) == 0:
         return {}
-    if nsplits == "auto":
-        # TODO: make a utility function for this that can be reused
-        # Also, should we use `chunksize` instead of `nsplits`?
-        targetsize = 256 * 1024 * 1024  # 256 MB
-        nsplits = len(G) ** 2 * G._A.dtype.np_type.itemsize // targetsize
-        if nsplits <= 1:
-            nsplits = None
+
+    chunksize = normalize_chunksize(chunksize, len(G) * G._A.dtype.np_type.itemsize, len(G))
+
     if nodes is None:
         # Should we use this one for subsets of nodes as well?
-        if nsplits is None:
+        if chunksize is None:
             result = algorithms.square_clustering(G)
         else:
-            result = _square_clustering_split(G, nsplits=nsplits)
+            result = _square_clustering_split(G, chunksize=chunksize)
         return G.vector_to_nodemap(result, fill_value=0)
     if nodes in G:
         idx = G._key_to_id[nodes]
         return algorithms.single_square_clustering(G, idx)
     ids = G.list_to_ids(nodes)
-    if nsplits is None:
+    if chunksize is None:
         result = algorithms.square_clustering(G, ids)
     else:
-        result = _square_clustering_split(G, ids, nsplits=nsplits)
+        result = _square_clustering_split(G, ids, chunksize=chunksize)
     return G.vector_to_nodemap(result)
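
To make the new "256 MiB" default concrete: normalize_chunksize receives the byte cost of one dense result row (len(G) * itemsize) and returns how many rows of `Asubset @ A` fit in the target. A back-of-the-envelope sketch (the graph size and 8-byte values here are illustrative assumptions):

from graphblas_algorithms.nxapi._utils import normalize_chunksize

n_nodes = 1_000_000          # hypothetical graph size
bytes_per_row = n_nodes * 8  # one dense row of `Asubset @ A` at 8 bytes per value

rows = normalize_chunksize("256 MiB", itemsize=bytes_per_row, N=n_nodes)
print(rows)  # 34 -> each A[chunk] @ A block is ~256 MiB if dense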

graphblas_algorithms/nxapi/shortest_paths/weighted.py

Lines changed: 5 additions & 13 deletions

@@ -1,6 +1,7 @@
 from graphblas_algorithms import algorithms
 from graphblas_algorithms.classes.digraph import to_graph
 
+from .._utils import normalize_chunksize, partition
 from ..exception import NetworkXUnbounded, NodeNotFound
 
 __all__ = [
@@ -9,18 +10,14 @@
 ]
 
 
-def all_pairs_bellman_ford_path_length(G, weight="weight", *, chunksize="auto"):
+def all_pairs_bellman_ford_path_length(G, weight="weight", *, chunksize="10 MiB"):
     # Larger chunksize offers more parallelism, but uses more memory.
     # Chunksize indicates for how many source nodes to compute at one time.
     # The default is to choose the number of rows so the result, if dense,
     # will be about 10MB.
     G = to_graph(G, weight=weight)
-    if chunksize == "auto":
-        # TODO: make a utility function for this that can be reused
-        targetsize = 10 * 1024 * 1024  # 10 MB
-        chunksize = max(1, targetsize // (len(G) * G._A.dtype.np_type.itemsize))
-
-    if chunksize is None or chunksize <= 0 or chunksize >= len(G):
+    chunksize = normalize_chunksize(chunksize, len(G) * G._A.dtype.np_type.itemsize, len(G))
+    if chunksize is None:
         # All at once
         try:
             D = algorithms.bellman_ford_path_lengths(G)
@@ -35,12 +32,7 @@ def all_pairs_bellman_ford_path_length(G, weight="weight", *, chunksize="auto"):
                 raise NetworkXUnbounded(*e.args) from e
             yield (source, G.vector_to_nodemap(d))
     else:
-        # We should probably make a utility function for chunking
-        nodes = list(G)
-        for start, stop in zip(
-            range(0, len(nodes), chunksize), range(chunksize, len(nodes) + chunksize, chunksize)
-        ):
-            cur_nodes = nodes[start:stop]
+        for cur_nodes in partition(chunksize, list(G)):
             try:
                 D = algorithms.bellman_ford_path_lengths(G, cur_nodes)
             except algorithms.exceptions.Unbounded as e:
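
Since normalize_chunksize accepts plain numbers as well as byte strings, callers can still chunk by source-node count. A usage sketch (the small weighted graph is made up for illustration):

import networkx as nx

from graphblas_algorithms.nxapi.shortest_paths.weighted import (
    all_pairs_bellman_ford_path_length,
)

G = nx.Graph()
G.add_weighted_edges_from([(0, 1, 2.0), (1, 2, 1.5), (2, 3, 0.5)])

# "10 MiB" is the byte-based default; an int chunks by source count instead.
for source, lengths in all_pairs_bellman_ford_path_length(G, chunksize=2):
    print(source, dict(lengths))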
Lines changed: 33 additions & 0 deletions

@@ -0,0 +1,33 @@
+import pytest
+
+from graphblas_algorithms.nxapi._utils import normalize_chunksize
+
+
+def test_normalize_chunksize():
+    assert normalize_chunksize(None) is None
+    assert normalize_chunksize("all") is None
+    assert normalize_chunksize("") is None
+    assert normalize_chunksize(-1) is None
+    assert normalize_chunksize("-1") is None
+    assert normalize_chunksize(10, N=10) is None
+    assert normalize_chunksize("1 MB", N=100) is None
+    assert normalize_chunksize("1 chunk") is None
+    assert normalize_chunksize("2 chunks", N=20) == 10
+    assert normalize_chunksize(10) == 10
+    assert normalize_chunksize(10.0) == 10
+    assert normalize_chunksize("10") == 10
+    assert normalize_chunksize("10.0") == 10
+    assert normalize_chunksize("1_0 B") == 10
+    assert normalize_chunksize("1e1") == 10
+    assert normalize_chunksize("1e-2 kb") == 10
+    assert normalize_chunksize("Mb") == 1000**2
+    assert normalize_chunksize(" mb") == 1000**2
+    assert normalize_chunksize("gib") == 1024**3
+    with pytest.raises(TypeError, match="chunksize must be"):
+        normalize_chunksize(object())
+    with pytest.raises(ValueError, match="as a bytes"):
+        normalize_chunksize("10 badbytes")
+    with pytest.raises(ValueError, match="as a number"):
+        normalize_chunksize("1bad0 TB")
+    with pytest.raises(TypeError, match="N argument is required"):
+        normalize_chunksize("10 chunks")

pyproject.toml

Lines changed: 1 addition & 0 deletions

@@ -199,6 +199,7 @@ ignore = [
     "PLR0913",  # Too many arguments to function call
     "PLR0915",  # Too many statements
     "PLR2004",  # Magic number used in comparison, consider replacing magic with a constant variable
+    "PLW2901",  # Outer for loop variable ... overwritten by inner assignment target (Note: good advice, but too strict)
     "RET502",  # Do not implicitly `return None` in function able to return non-`None` value
     "RET503",  # Missing explicit `return` at the end of function able to return non-`None` value
     "RET504",  # Unnecessary variable assignment before `return` statement
