Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 591d96b

Browse files
authored
Major rework to find files.
Find files which might contribute to unique extents. Add extra arguments.
1 parent e525b1f commit 591d96b

File tree

1 file changed

+193
-64
lines changed

1 file changed

+193
-64
lines changed

‎subvolume.py‎

Lines changed: 193 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -20,64 +20,145 @@
2020
import sys
2121
from collections import deque
2222
from collections import Counter
23+
from collections import defaultdict
24+
import math, array
25+
from functools import lru_cache
2326

24-
#Class to hold data. It's a dictionary of dictionaries.
25-
#tree[key of the extent]= {range1: [snapshots],range2: [snapshots]}
27+
#function to convert a pair of positive integers to a single integer
28+
#we want to decrease memory consumption, thus we need this trick
29+
#http://szudzik.com/ElegantPairing.pdf
30+
#cache the results for speed up
31+
32+
@lru_cache(maxsize=32)
33+
def unique_number(x,y):
34+
result=x
35+
if x >=y:
36+
result+=y
37+
result+=x**2
38+
else:
39+
result+=y**2
40+
return result
41+
42+
#undo the above function, return x,y based on a single number
43+
#also cache the results
44+
45+
@lru_cache(maxsize=131072)
46+
def unique_to_pair(number):
47+
root=int(math.floor(math.sqrt(number)))
48+
crit=number-root**2
49+
if crit< root:
50+
x=crit
51+
y=root
52+
else:
53+
x=root
54+
y=crit-root
55+
return x,y
56+
57+
#take a list of 'paired' numbers and return the x coordinate, which is snapshot
58+
#stored into the pair
59+
def return_snapshots(mylist):
60+
result=[]
61+
for item in mylist:
62+
snapshot,_=unique_to_pair(item)
63+
result.append(snapshot)
64+
return result
65+
66+
#take a list of 'paired' numbers and return the paired number that has the same
67+
#x coordinate, which is the snapshot stored into the pair
68+
def return_coded(mylist,snapshot):
69+
for item in mylist:
70+
snapshot_pair,_=unique_to_pair(item)
71+
if snapshot_pair == snapshot:
72+
return item
73+
return None
2674

75+
#take a paired number and compare it with a snapshot
76+
#cache the results for speedup
77+
@lru_cache(maxsize=131072)
78+
def compare_pair_to_snapshot(item,snapshot):
79+
snapshot_pair,_=unique_to_pair(item)
80+
if snapshot_pair == snapshot:
81+
return True
82+
return False
83+
84+
#Class to hold data. It's a dictionary of dictionaries.
85+
#tree[key of the extent]= {range1: [list of paired (snapshot,inode)],range2: [list of paired (snapshot,inode)]}
86+
#inode data are used to find which files hold data of unique extents.
2787
class TreeWrapper:
2888
def __init__(self):
2989
self._tree=dict()
3090
self._snapshots=[]
3191

3292
#unfortunately some extents reappear, maybe there are dedup or reflink?
3393
#right now they are completely ignored
34-
def add(self,tree,key,start,stop):
94+
95+
#check if the current tree has data for this extent/key.
96+
#if it has, check if the current extent range is already parsed.
97+
98+
#use array instead of list because integers consume too much memory in python
99+
def add(self,tree,key,start,stop,inode):
100+
mypair=unique_number(tree,inode)
35101
if key in self._tree.keys():
36-
#data_tree[key].update([datum.offset,stop])
37102
add=True
38103
ranges=sorted(self._tree[key].keys())
39104
for limit in ranges:
40105
if limit > stop:
41106
break
42-
snapshots=self._tree[key][limit]
107+
#this code needs to be reworked to cover when extents are adjacent
108+
#for the same snapshot
43109
if limit == start or limit == stop:
44-
if tree in snapshots:
110+
#since snapshots are parsed linearly, check only if
111+
#the last data are from the same snapshot
112+
if compare_pair_to_snapshot(self._tree[key][limit][-1],tree):
45113
add=False
46-
#print(tree,key,start,stop)
47-
#print(sorted(self._tree[key].items()))
48114
break
49115
if add:
50116
if start in self._tree[key].keys():
51-
self._tree[key][start].append(tree)
117+
self._tree[key][start].append(mypair)
52118
else:
53-
self._tree[key][start]=[tree]
119+
self._tree[key][start]=array.array('Q')
120+
self._tree[key][start].append(mypair)
54121
if stop in self._tree[key].keys():
55-
self._tree[key][stop].append(tree)
122+
self._tree[key][stop].append(mypair)
56123
else:
57-
self._tree[key][stop]=[tree]
124+
self._tree[key][stop]=array.array('Q')
125+
self._tree[key][stop].append(mypair)
58126
else:
59-
#data_tree[key]=sortedcontainers.SortedSet([datum.offset,stop])
60-
#self._tree[key]=sortedcontainers.SortedDict()
61127
self._tree[key]=dict()
62-
self._tree[key][start]=[tree]
63-
self._tree[key][stop]=[tree]
128+
self._tree[key][start]=array.array('Q')
129+
self._tree[key][start].append(mypair)
130+
self._tree[key][stop]=array.array('Q')
131+
self._tree[key][stop].append(mypair)
64132

65-
#each range marker should have only the snapshots that cover the upcoming range
133+
#this function analyzes the tree after all data are added.
134+
#for each range find which subvolumes use that range.
135+
#each snapshot has added its start and stop.
136+
#we keep the snapshots only in the start part.
137+
#scenario before: extent1: pos_1[tree1]..........pos_2[tree2]....pos_3[tree2]...pos_4[tree1]
138+
#final result: pos_1[tree1]..........pos_2[tree1,tree2]....pos_3[tree1]...pos_4[]
66139
def transform(self):
67140
list_of_extents=sorted(self._tree.keys())
68141
i=0
69142
while i < len(list_of_extents):
70143
extent=list_of_extents[i]
71144
rangedict=self._tree[extent]
72-
#iterableview = rangedict.items()
73145
list_of_ranges=sorted(rangedict.keys())
74146
for j,myrange in enumerate(list_of_ranges):
75147
if j ==0:
76148
continue
77-
#myrange,myset=mytuple
78-
myset=set(rangedict[myrange])
79-
result = set(rangedict[list_of_ranges[j-1]])^myset
80-
rangedict[myrange]=list(result)
149+
#the upcoming extent is used by the snapshots symmetrical difference of set of snapshots
150+
myset=set(return_snapshots(rangedict[myrange]))
151+
result = set(return_snapshots(rangedict[list_of_ranges[j-1]]))^myset
152+
#again store the result in an array, not a list.
153+
subvol_list=array.array('Q')
154+
for subvol in result:
155+
data= return_coded(rangedict[myrange],subvol)
156+
if data ==None:
157+
data=return_coded(rangedict[list_of_ranges[j-1]],subvol)
158+
if data ==None:
159+
print("problem!",data,subvol)
160+
subvol_list.append(data)
161+
rangedict[myrange]=subvol_list
81162
self._tree[extent]=rangedict
82163
i+=1
83164

@@ -99,29 +180,40 @@ def __len__(self):
99180
return result
100181

101182
#find those ranges that have only one snapshot, if this snapshot is deleted
102-
#this space will be freed
103-
def find_unique(self):
183+
#this space will be freed.
184+
#based on the scenario of transform it should return:
185+
#result[tree1]=pos2-pos1+pos4-pos3
186+
#result[tree2]=0
187+
#if files are analyzed use the inode data to find them and store them in a different dictionary.
188+
def find_unique(self,fs,analyze_file):
104189
result=Counter()
190+
result_data=defaultdict(set)
105191
for extent,rangedict in self._tree.items():
106192
iterableview = sorted(rangedict.items())
107193
for i,mytuple in enumerate(iterableview):
108-
myrange,myset=mytuple
194+
myrange,unique_pair_list=mytuple
109195
#myset=list(myset)
110-
if len(myset)==1:
111-
try:
112-
size=iterableview[i+1][0]-myrange
113-
result[myset[0]]+=size
114-
except:
115-
print(extent,rangedict,mytuple)
116-
return result
117-
118-
#helper function to find the size of the ranges that have the desired snapshots
196+
if len(unique_pair_list)==1:
197+
subvolume,inode=unique_to_pair(unique_pair_list[0])
198+
size=iterableview[i+1][0]-myrange
199+
result[subvolume]+=size
200+
#result[myset[0]]+=size
201+
#print(inode)
202+
if analyze_file:
203+
try:
204+
file=btrfs.ioctl.ino_lookup(fs.fd,subvolume,inode)
205+
result_data[file.name_bytes.decode('utf-8')].add(subvolume)
206+
except:
207+
print("Inode not found",inode)
208+
return result,result_data
209+
210+
#helper function to find the size of the extend ranges that have the desired snapshots
119211
def find_snapshots_size(self,wanted,not_wanted):
120212
result=0
121213
for extent,rangedict in self._tree.items():
122214
rangelist = sorted(rangedict.keys())
123215
for i,myrange in enumerate(rangelist):
124-
snapshots=set(rangedict[myrange])
216+
snapshots=set(return_snapshots(rangedict[myrange]))
125217
if len(set(wanted) & snapshots)>0 and len(set(not_wanted) & snapshots) ==0:
126218
try:
127219
result+=rangelist[i+1]-myrange
@@ -130,10 +222,12 @@ def find_snapshots_size(self,wanted,not_wanted):
130222
print(extent,sorted(rangedict.items()),myrange)
131223
return result
132224

225+
#the active subvolume must be the last one
133226
def add_snapshots(self,snapshots):
134227
self._snapshots=snapshots.copy()
135228

136229
#calculate the size of ranges ontop of the previous subvolume
230+
#older subvolumes must be first in subvolume list
137231
def find_snapshot_size_to_previous(self):
138232
results=Counter()
139233
for i, snapshot in enumerate(self._snapshots):
@@ -154,54 +248,84 @@ def find_snapshot_size_to_current(self):
154248
results[snapshot]+=self.find_snapshots_size([snapshot],[current])
155249
return results
156250

157-
251+
#main function to parse data from disk and add them to the tree of extents
158252
def disk_parse(data_tree,fs,tree):
159253
print("Parsing subvolume:",tree)
160-
for header, data in btrfs.ioctl.search_v2(fs.fd, tree):
254+
min_key=btrfs.ctree.Key(0,btrfs.ctree.EXTENT_DATA_KEY,0)
255+
for header, data in btrfs.ioctl.search_v2(fs.fd, tree,min_key):
161256
if header.type == btrfs.ctree.EXTENT_DATA_KEY:
162257
datum=btrfs.ctree.FileExtentItem(header,data)
163258
if datum.type != btrfs.ctree.FILE_EXTENT_INLINE:# and datum.disk_bytenr !=0:
164259
key=(datum.disk_bytenr,datum.disk_num_bytes)
165260
stop=datum.offset+datum.num_bytes
166-
data_tree.add(tree,key,datum.offset,stop)
167-
261+
data_tree.add(tree,key,datum.offset,stop,datum.key.objectid)
262+
168263

169264
def main():
170265
parser = argparse.ArgumentParser()
171-
parser.add_argument("-u","--unique",action='store_true',help="calculate only unique data, -r makes no sense")
266+
parser.add_argument("-u","--unique",action='store_true',help="calculate only unique data, -r argument makes no sense if -u is active")
267+
parser.add_argument("-f","--files",action='store_true',help="find filenames that exist in unique extents")
172268
parser.add_argument("path", type=str,
173269
help="path of the btrfs filesystem")
174270
parser.add_argument("-r", "--root", type=int,default=5,
175-
help="current active subvolume to analyze, default is 5")
176-
parser.add_argument('subvolume', nargs='*', type=int, help='Do not analyze these subvolumes')
271+
help="current active subvolume to analyze first, default is 5")
272+
group = parser.add_mutually_exclusive_group()
273+
group.add_argument('-i', '--ignore', action='store_true',help="Do not analyze the specified subvolumes")
274+
group.add_argument('-o', '--only', action='store_true',help="Analyze only the specified subvolumes")
275+
parser.add_argument('subvolume', nargs='*', type=int, help='Subvolumes to ingore or analyze')
177276
args=parser.parse_args()
178-
179-
#list of ignored subvolumes
180-
ignored_trees=set(args.subvolume)
181-
ignored_trees.add(args.root)
277+
278+
#find subvolumes to parse, make sure -r subvolume stays first
279+
parse_trees=[5]
280+
if args.root!=5:
281+
parse_trees=[args.root,5]
182282
fs = btrfs.FileSystem(args.path)
283+
for subvol in fs.subvolumes():
284+
if subvol.key.objectid != args.root:
285+
parse_trees.append(subvol.key.objectid)
286+
287+
#these are the subvolumes specified by the user, these will be either ignored
288+
#or all the other subvolumes will be ingored
289+
special_subvolumes=set(args.subvolume)
290+
291+
#if no argument is specified then assume that the user wanted to ignore the specified subvolumes
292+
if args.ignore == False and args.only== False:
293+
args.ignore=True
294+
295+
#remove the unneeded subvolumes
296+
if args.ignore:
297+
for item in special_subvolumes:
298+
try:
299+
parse_trees.remove(item)
300+
except:
301+
pass
302+
else:
303+
for tree in parse_trees[:]:
304+
if tree not in special_subvolumes:
305+
parse_trees.remove(tree)
183306

184-
#data_tree=sortedcontainers.SortedDict()
185-
#data_tree=dict()
186307
data_tree=TreeWrapper()
187-
#data_tree=TreeWrapperSql()
188-
#data_tree=TreeWrapperCompress()
189-
snapshots=[]
190-
191-
disk_parse(data_tree,fs,args.root)
192-
snapshots.append(args.root)
193308

194-
for subvol in fs.subvolumes():
195-
tree = subvol.key.objectid
196-
if tree not in ignored_trees:
197-
disk_parse(data_tree,fs,tree)
198-
snapshots.append(tree)
199-
changed_snapshots = deque(snapshots)
309+
#move the root subvolume in the end
310+
#older subvolumes must be first
311+
changed_snapshots = deque(parse_trees)
200312
changed_snapshots.rotate(-1)
201-
data_tree.add_snapshots(list(changed_snapshots))
313+
parse_trees=list(changed_snapshots)
314+
data_tree.add_snapshots(parse_trees)
315+
316+
#parse the trees from newer to older
317+
parse_trees=list(reversed(parse_trees))
318+
print("Subvolumes to parse:",parse_trees)
319+
for tree in parse_trees:
320+
disk_parse(data_tree,fs,tree)
321+
202322
data_tree.transform()
323+
#print(unique_number.cache_info())
324+
#print(unique_to_pair.cache_info())
325+
#print(compare_pair_to_snapshot.cache_info())
203326
unique_sum=0
204-
unique_data=data_tree.find_unique()
327+
unique_data,files=data_tree.find_unique(fs,args.files)
328+
#if unique analysis is only needed, do not calculate differences
205329
if args.unique:
206330
current_data=Counter()
207331
previous_data=Counter()
@@ -212,11 +336,16 @@ def main():
212336
print(" per subvolume of previous subvolume current(act) subvolume")
213337
print("---------------------|---------------------|----------------------")
214338
print("SubvolumId Size Size Size")
215-
for snapshot in reversed(changed_snapshots):
339+
for snapshot in parse_trees:
216340
print("{:>10} {:>10} {:>10} {:>10}".format(snapshot,btrfs.utils.pretty_size(unique_data[snapshot]),btrfs.utils.pretty_size(previous_data[snapshot]),btrfs.utils.pretty_size(current_data[snapshot])))
341+
#print(files[snapshot])
217342
unique_sum+=unique_data[snapshot]
218343
print("Size/Cost of subvolumes:",btrfs.utils.pretty_size(unique_sum),"Volatility:","{:.2%}".format(unique_sum/len(data_tree)))
219-
344+
if args.files:
345+
print()
346+
print("Possible Unique Files:")
347+
for file,myset in files.items():
348+
print(file,":",myset)
220349

221350
if __name__ == '__main__':
222351
main()

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /