 import sys
 from collections import deque
 from collections import Counter
+from collections import defaultdict
+import math, array
+from functools import lru_cache
 
-#Class to hold data. It's a dictionary of dictionaries.
-#tree[key of the extent]= {range1: [snapshots],range2: [snapshots]}
+#function to convert a pair of positive integers to a single integer
+#we want to decrease memory consumption, hence this trick
+#http://szudzik.com/ElegantPairing.pdf
+#cache the results for a speedup
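+#the pairing is Szudzik's elegant pairing: pair(x,y) = x*x + x + y when x >= y, otherwise y*y + x,
+#a bijection between pairs of non-negative integers and single non-negative integers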
+
+@lru_cache(maxsize=32)
+def unique_number(x, y):
+    result = x
+    if x >= y:
+        result += y
+        result += x ** 2
+    else:
+        result += y ** 2
+    return result
+
+#undo the above function, return x,y based on a single number
+#also cache the results
+
+@lru_cache(maxsize=131072)
+def unique_to_pair(number):
+    #math.isqrt (Python 3.8+) gives an exact integer square root;
+    #math.floor(math.sqrt()) can be off by one for very large paired numbers
+    root = math.isqrt(number)
+    crit = number - root ** 2
+    if crit < root:
+        x = crit
+        y = root
+    else:
+        x = root
+        y = crit - root
+    return x, y
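+
+#worked example of the round trip: unique_number(3, 5) == 3 + 5**2 == 28 and unique_to_pair(28) == (3, 5);
+#unique_number(5, 3) == 5 + 3 + 5**2 == 33 and unique_to_pair(33) == (5, 3)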
+
+#take a list of 'paired' numbers and return the x coordinates, which are the snapshots
+#stored in the pairs
+def return_snapshots(mylist):
+    result = []
+    for item in mylist:
+        snapshot, _ = unique_to_pair(item)
+        result.append(snapshot)
+    return result
+
+#take a list of 'paired' numbers and return the paired number whose x coordinate
+#matches the given snapshot
+def return_coded(mylist, snapshot):
+    for item in mylist:
+        snapshot_pair, _ = unique_to_pair(item)
+        if snapshot_pair == snapshot:
+            return item
+    return None
 
+#take a paired number and compare its snapshot part with a given snapshot
+#cache the results for a speedup
+@lru_cache(maxsize=131072)
+def compare_pair_to_snapshot(item, snapshot):
+    snapshot_pair, _ = unique_to_pair(item)
+    return snapshot_pair == snapshot
+
+#Class to hold data. It's a dictionary of dictionaries.
+#tree[key of the extent] = {range1: [list of paired (snapshot,inode)], range2: [list of paired (snapshot,inode)]}
+#inode data are used to find which files hold data of unique extents.
 class TreeWrapper:
     def __init__(self):
         self._tree = dict()
         self._snapshots = []
 
     #unfortunately some extents reappear, maybe there is dedup or reflink involved?
     #right now they are completely ignored
-    def add(self, tree, key, start, stop):
+
+    #check if the current tree has data for this extent/key.
+    #if it has, check whether the current extent range has already been parsed.
+
+    #use array instead of list because plain Python integers consume too much memory
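+    #an array.array('Q') stores each pair as one 8-byte unsigned 64-bit integer, while a plain
+    #list keeps a full Python int object plus a pointer per entry, which costs several times more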
+    def add(self, tree, key, start, stop, inode):
+        mypair = unique_number(tree, inode)
         if key in self._tree.keys():
-            #data_tree[key].update([datum.offset,stop])
             add = True
             ranges = sorted(self._tree[key].keys())
             for limit in ranges:
                 if limit > stop:
                     break
-                snapshots = self._tree[key][limit]
+                #this code needs to be reworked to cover extents that are adjacent
+                #for the same snapshot
                 if limit == start or limit == stop:
-                    if tree in snapshots:
+                    #since snapshots are parsed linearly, check only whether
+                    #the last entry is from the same snapshot
+                    if compare_pair_to_snapshot(self._tree[key][limit][-1], tree):
                         add = False
-                        #print(tree,key,start,stop)
-                        #print(sorted(self._tree[key].items()))
                         break
             if add:
                 if start in self._tree[key].keys():
-                    self._tree[key][start].append(tree)
+                    self._tree[key][start].append(mypair)
                 else:
-                    self._tree[key][start] = [tree]
+                    self._tree[key][start] = array.array('Q')
+                    self._tree[key][start].append(mypair)
                 if stop in self._tree[key].keys():
-                    self._tree[key][stop].append(tree)
+                    self._tree[key][stop].append(mypair)
                 else:
-                    self._tree[key][stop] = [tree]
+                    self._tree[key][stop] = array.array('Q')
+                    self._tree[key][stop].append(mypair)
         else:
-            #data_tree[key]=sortedcontainers.SortedSet([datum.offset,stop])
-            #self._tree[key]=sortedcontainers.SortedDict()
             self._tree[key] = dict()
-            self._tree[key][start] = [tree]
-            self._tree[key][stop] = [tree]
+            self._tree[key][start] = array.array('Q')
+            self._tree[key][start].append(mypair)
+            self._tree[key][stop] = array.array('Q')
+            self._tree[key][stop].append(mypair)
 
-    #each range marker should have only the snapshots that cover the upcoming range
+    #this function analyzes the tree after all data are added.
+    #for each range find which subvolumes use that range.
+    #each snapshot has added its start and stop.
+    #we keep the snapshots only in the start part.
+    #scenario before: extent1: pos_1[tree1]..........pos_2[tree2]....pos_3[tree2]...pos_4[tree1]
+    #final result: pos_1[tree1]..........pos_2[tree1,tree2]....pos_3[tree1]...pos_4[]
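+    #at pos_2 the symmetric difference {tree1} ^ {tree2} gives {tree1,tree2}; at pos_3
+    #{tree1,tree2} ^ {tree2} gives {tree1}; at pos_4 {tree1} ^ {tree1} gives the empty set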
     def transform(self):
         list_of_extents = sorted(self._tree.keys())
         i = 0
         while i < len(list_of_extents):
             extent = list_of_extents[i]
             rangedict = self._tree[extent]
-            #iterableview = rangedict.items()
             list_of_ranges = sorted(rangedict.keys())
             for j, myrange in enumerate(list_of_ranges):
                 if j == 0:
                     continue
-                #myrange,myset=mytuple
-                myset = set(rangedict[myrange])
-                result = set(rangedict[list_of_ranges[j - 1]]) ^ myset
-                rangedict[myrange] = list(result)
+                #the upcoming range is used by the symmetric difference of the previous
+                #and current sets of snapshots
+                myset = set(return_snapshots(rangedict[myrange]))
+                result = set(return_snapshots(rangedict[list_of_ranges[j - 1]])) ^ myset
+                #again store the result in an array, not a list
+                subvol_list = array.array('Q')
+                for subvol in result:
+                    data = return_coded(rangedict[myrange], subvol)
+                    if data is None:
+                        data = return_coded(rangedict[list_of_ranges[j - 1]], subvol)
+                    if data is None:
+                        print("problem!", data, subvol)
+                        continue
+                    subvol_list.append(data)
+                rangedict[myrange] = subvol_list
             self._tree[extent] = rangedict
             i += 1
 
@@ -99,29 +180,40 @@ def __len__(self):
         return result
 
     #find those ranges that have only one snapshot; if this snapshot is deleted
-    #this space will be freed
-    def find_unique(self):
+    #this space will be freed.
+    #based on the scenario in transform it should return:
+    #result[tree1] = pos2-pos1 + pos4-pos3
+    #result[tree2] = 0
+    #if files are analyzed, use the inode data to find them and store them in a separate dictionary.
+    def find_unique(self, fs, analyze_file):
         result = Counter()
+        result_data = defaultdict(set)
         for extent, rangedict in self._tree.items():
             iterableview = sorted(rangedict.items())
             for i, mytuple in enumerate(iterableview):
-                myrange, myset = mytuple
+                myrange, unique_pair_list = mytuple
                 #myset=list(myset)
-                if len(myset) == 1:
-                    try:
-                        size = iterableview[i + 1][0] - myrange
-                        result[myset[0]] += size
-                    except:
-                        print(extent, rangedict, mytuple)
-        return result
-
-    #helper function to find the size of the ranges that have the desired snapshots
+                if len(unique_pair_list) == 1:
+                    subvolume, inode = unique_to_pair(unique_pair_list[0])
+                    size = iterableview[i + 1][0] - myrange
+                    result[subvolume] += size
+                    #result[myset[0]]+=size
+                    #print(inode)
+                    if analyze_file:
+                        try:
+                            file = btrfs.ioctl.ino_lookup(fs.fd, subvolume, inode)
+                            result_data[file.name_bytes.decode('utf-8')].add(subvolume)
+                        except:
+                            print("Inode not found", inode)
+        return result, result_data
+
+    #helper function to find the size of the extent ranges that have the desired snapshots
     def find_snapshots_size(self, wanted, not_wanted):
         result = 0
         for extent, rangedict in self._tree.items():
             rangelist = sorted(rangedict.keys())
             for i, myrange in enumerate(rangelist):
-                snapshots = set(rangedict[myrange])
+                snapshots = set(return_snapshots(rangedict[myrange]))
                 if len(set(wanted) & snapshots) > 0 and len(set(not_wanted) & snapshots) == 0:
                     try:
                         result += rangelist[i + 1] - myrange
@@ -130,10 +222,12 @@ def find_snapshots_size(self,wanted,not_wanted):
                         print(extent, sorted(rangedict.items()), myrange)
         return result
 
+    #the active subvolume must be the last one
     def add_snapshots(self, snapshots):
         self._snapshots = snapshots.copy()
 
     #calculate the size of ranges on top of the previous subvolume
+    #older subvolumes must come first in the subvolume list
    def find_snapshot_size_to_previous(self):
         results = Counter()
         for i, snapshot in enumerate(self._snapshots):
@@ -154,54 +248,84 @@ def find_snapshot_size_to_current(self):
             results[snapshot] += self.find_snapshots_size([snapshot], [current])
         return results
 
-
+#main function to parse data from disk and add them to the tree of extents
 def disk_parse(data_tree, fs, tree):
     print("Parsing subvolume:", tree)
-    for header, data in btrfs.ioctl.search_v2(fs.fd, tree):
+    min_key = btrfs.ctree.Key(0, btrfs.ctree.EXTENT_DATA_KEY, 0)
+    for header, data in btrfs.ioctl.search_v2(fs.fd, tree, min_key):
         if header.type == btrfs.ctree.EXTENT_DATA_KEY:
             datum = btrfs.ctree.FileExtentItem(header, data)
             if datum.type != btrfs.ctree.FILE_EXTENT_INLINE:  # and datum.disk_bytenr != 0:
                 key = (datum.disk_bytenr, datum.disk_num_bytes)
                 stop = datum.offset + datum.num_bytes
-                data_tree.add(tree, key, datum.offset, stop)
-
+                data_tree.add(tree, key, datum.offset, stop, datum.key.objectid)
+
 
 def main():
     parser = argparse.ArgumentParser()
-    parser.add_argument("-u", "--unique", action='store_true', help="calculate only unique data, -r makes no sense")
+    parser.add_argument("-u", "--unique", action='store_true', help="calculate only unique data; the -r argument makes no sense if -u is active")
+    parser.add_argument("-f", "--files", action='store_true', help="find filenames that exist in unique extents")
     parser.add_argument("path", type=str,
                         help="path of the btrfs filesystem")
     parser.add_argument("-r", "--root", type=int, default=5,
-                        help="current active subvolume to analyze, default is 5")
-    parser.add_argument('subvolume', nargs='*', type=int, help='Do not analyze these subvolumes')
+                        help="current active subvolume to analyze first, default is 5")
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument('-i', '--ignore', action='store_true', help="Do not analyze the specified subvolumes")
+    group.add_argument('-o', '--only', action='store_true', help="Analyze only the specified subvolumes")
+    parser.add_argument('subvolume', nargs='*', type=int, help='Subvolumes to ignore or analyze')
     args = parser.parse_args()
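+
+    #example invocations (the script filename below is hypothetical):
+    #  python3 subvolume_sizes.py /mnt/data                analyze every subvolume
+    #  python3 subvolume_sizes.py -u -f /mnt/data          only unique data, plus unique filenames
+    #  python3 subvolume_sizes.py -i /mnt/data 257 258     skip subvolumes 257 and 258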
-
-    #list of ignored subvolumes
-    ignored_trees = set(args.subvolume)
-    ignored_trees.add(args.root)
+
+    #find the subvolumes to parse, making sure the -r subvolume stays first
+    parse_trees = [5]
+    if args.root != 5:
+        parse_trees = [args.root, 5]
     fs = btrfs.FileSystem(args.path)
+    for subvol in fs.subvolumes():
+        if subvol.key.objectid != args.root:
+            parse_trees.append(subvol.key.objectid)
+
+    #these are the subvolumes specified by the user; either they will be ignored
+    #or all the other subvolumes will be ignored
+    special_subvolumes = set(args.subvolume)
+
+    #if no argument is specified, assume that the user wanted to ignore the specified subvolumes
+    if not args.ignore and not args.only:
+        args.ignore = True
+
+    #remove the unneeded subvolumes
+    if args.ignore:
+        for item in special_subvolumes:
+            try:
+                parse_trees.remove(item)
+            except ValueError:
+                pass
+    else:
+        for tree in parse_trees[:]:
+            if tree not in special_subvolumes:
+                parse_trees.remove(tree)
 
-    #data_tree=sortedcontainers.SortedDict()
-    #data_tree=dict()
     data_tree = TreeWrapper()
-    #data_tree=TreeWrapperSql()
-    #data_tree=TreeWrapperCompress()
-    snapshots = []
-
-    disk_parse(data_tree, fs, args.root)
-    snapshots.append(args.root)
 
-    for subvol in fs.subvolumes():
-        tree = subvol.key.objectid
-        if tree not in ignored_trees:
-            disk_parse(data_tree, fs, tree)
-            snapshots.append(tree)
-    changed_snapshots = deque(snapshots)
+    #move the root subvolume to the end
+    #older subvolumes must come first
+    changed_snapshots = deque(parse_trees)
     changed_snapshots.rotate(-1)
-    data_tree.add_snapshots(list(changed_snapshots))
+    parse_trees = list(changed_snapshots)
+    data_tree.add_snapshots(parse_trees)
+
+    #parse the trees from newer to older
+    parse_trees = list(reversed(parse_trees))
+    print("Subvolumes to parse:", parse_trees)
+    for tree in parse_trees:
+        disk_parse(data_tree, fs, tree)
+
     data_tree.transform()
+    #print(unique_number.cache_info())
+    #print(unique_to_pair.cache_info())
+    #print(compare_pair_to_snapshot.cache_info())
     unique_sum = 0
-    unique_data = data_tree.find_unique()
+    unique_data, files = data_tree.find_unique(fs, args.files)
+    #if only the unique analysis is needed, do not calculate the differences
     if args.unique:
         current_data = Counter()
         previous_data = Counter()
@@ -212,11 +336,16 @@ def main():
     print(" per subvolume of previous subvolume current(act) subvolume")
     print("---------------------|---------------------|----------------------")
     print("SubvolumId Size Size Size")
-    for snapshot in reversed(changed_snapshots):
+    for snapshot in parse_trees:
         print("{:>10} {:>10} {:>10} {:>10}".format(snapshot, btrfs.utils.pretty_size(unique_data[snapshot]), btrfs.utils.pretty_size(previous_data[snapshot]), btrfs.utils.pretty_size(current_data[snapshot])))
+        #print(files[snapshot])
         unique_sum += unique_data[snapshot]
     print("Size/Cost of subvolumes:", btrfs.utils.pretty_size(unique_sum), "Volatility:", "{:.2%}".format(unique_sum / len(data_tree)))
-
+    if args.files:
+        print()
+        print("Possible Unique Files:")
+        for file, myset in files.items():
+            print(file, ":", myset)
 
 if __name__ == '__main__':
     main()