 import sys
 from collections import deque
 from collections import Counter
+from collections import defaultdict
+import math, array
+from functools import lru_cache
 
-#Class to hold data. It's a dictionary of dictionaries.
-#tree[key of the extent]= {range1: [snapshots],range2: [snapshots]}
+#function to convert a pair of positive integers to a single integer
+#we want to decrease memory consumption, hence this trick
+#http://szudzik.com/ElegantPairing.pdf
+#cache the results for a speedup
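+#the pairing is Szudzik's elegant pairing: pair(x,y) = x*x + x + y when x >= y, otherwise y*y + x,
+#a bijection between pairs of non-negative integers and single non-negative integers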
+
+@lru_cache(maxsize=32)
+def unique_number(x, y):
+    result = x
+    if x >= y:
+        result += y
+        result += x ** 2
+    else:
+        result += y ** 2
+    return result
+
+#undo the above function, return x,y based on a single number
+#also cache the results
+
+@lru_cache(maxsize=131072)
+def unique_to_pair(number):
+    #math.isqrt (Python 3.8+) gives an exact integer square root;
+    #math.floor(math.sqrt()) can be off by one for very large paired numbers
+    root = math.isqrt(number)
+    crit = number - root ** 2
+    if crit < root:
+        x = crit
+        y = root
+    else:
+        x = root
+        y = crit - root
+    return x, y
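+
+#worked example of the round trip: unique_number(3, 5) == 3 + 5**2 == 28 and unique_to_pair(28) == (3, 5);
+#unique_number(5, 3) == 5 + 3 + 5**2 == 33 and unique_to_pair(33) == (5, 3)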
+
+#take a list of 'paired' numbers and return the x coordinates, which are the snapshots
+#stored in the pairs
+def return_snapshots(mylist):
+    result = []
+    for item in mylist:
+        snapshot, _ = unique_to_pair(item)
+        result.append(snapshot)
+    return result
+
+#take a list of 'paired' numbers and return the paired number whose x coordinate
+#matches the given snapshot
+def return_coded(mylist, snapshot):
+    for item in mylist:
+        snapshot_pair, _ = unique_to_pair(item)
+        if snapshot_pair == snapshot:
+            return item
+    return None
 
+#take a paired number and compare its snapshot part with a given snapshot
+#cache the results for a speedup
+@lru_cache(maxsize=131072)
+def compare_pair_to_snapshot(item, snapshot):
+    snapshot_pair, _ = unique_to_pair(item)
+    return snapshot_pair == snapshot
+
+#Class to hold data. It's a dictionary of dictionaries.
+#tree[key of the extent] = {range1: [list of paired (snapshot,inode)], range2: [list of paired (snapshot,inode)]}
+#inode data are used to find which files hold data of unique extents.
 class TreeWrapper:
     def __init__(self):
         self._tree = dict()
         self._snapshots = []
 
     #unfortunately some extents reappear, maybe there is dedup or reflink involved?
     #right now they are completely ignored
-    def add(self, tree, key, start, stop):
+
+    #check if the current tree has data for this extent/key.
+    #if it has, check whether the current extent range has already been parsed.
+
+    #use array instead of list because plain Python integers consume too much memory
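+    #an array.array('Q') stores each pair as one 8-byte unsigned 64-bit integer, while a plain
+    #list keeps a full Python int object plus a pointer per entry, which costs several times more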
+    def add(self, tree, key, start, stop, inode):
+        mypair = unique_number(tree, inode)
         if key in self._tree.keys():
-            #data_tree[key].update([datum.offset,stop])
             add = True
             ranges = sorted(self._tree[key].keys())
             for limit in ranges:
                 if limit > stop:
                     break
-                snapshots = self._tree[key][limit]
+                #this code needs to be reworked to cover extents that are adjacent
+                #for the same snapshot
                 if limit == start or limit == stop:
-                    if tree in snapshots:
+                    #since snapshots are parsed linearly, check only whether
+                    #the last entry is from the same snapshot
+                    if compare_pair_to_snapshot(self._tree[key][limit][-1], tree):
                         add = False
-                        #print(tree,key,start,stop)
-                        #print(sorted(self._tree[key].items()))
                         break
             if add:
                 if start in self._tree[key].keys():
-                    self._tree[key][start].append(tree)
+                    self._tree[key][start].append(mypair)
                 else:
-                    self._tree[key][start] = [tree]
+                    self._tree[key][start] = array.array('Q')
+                    self._tree[key][start].append(mypair)
                 if stop in self._tree[key].keys():
-                    self._tree[key][stop].append(tree)
+                    self._tree[key][stop].append(mypair)
                 else:
-                    self._tree[key][stop] = [tree]
+                    self._tree[key][stop] = array.array('Q')
+                    self._tree[key][stop].append(mypair)
         else:
-            #data_tree[key]=sortedcontainers.SortedSet([datum.offset,stop])
-            #self._tree[key]=sortedcontainers.SortedDict()
             self._tree[key] = dict()
-            self._tree[key][start] = [tree]
-            self._tree[key][stop] = [tree]
+            self._tree[key][start] = array.array('Q')
+            self._tree[key][start].append(mypair)
+            self._tree[key][stop] = array.array('Q')
+            self._tree[key][stop].append(mypair)
 
-    #each range marker should have only the snapshots that cover the upcoming range
+    #this function analyzes the tree after all data are added.
+    #for each range find which subvolumes use that range.
+    #each snapshot has added its start and stop.
+    #we keep the snapshots only in the start part.
+    #scenario before: extent1: pos_1[tree1]..........pos_2[tree2]....pos_3[tree2]...pos_4[tree1]
+    #final result: pos_1[tree1]..........pos_2[tree1,tree2]....pos_3[tree1]...pos_4[]
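+    #at pos_2 the symmetric difference {tree1} ^ {tree2} gives {tree1,tree2}; at pos_3
+    #{tree1,tree2} ^ {tree2} gives {tree1}; at pos_4 {tree1} ^ {tree1} gives the empty set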
     def transform(self):
         list_of_extents = sorted(self._tree.keys())
         i = 0
         while i < len(list_of_extents):
             extent = list_of_extents[i]
             rangedict = self._tree[extent]
-            #iterableview = rangedict.items()
             list_of_ranges = sorted(rangedict.keys())
             for j, myrange in enumerate(list_of_ranges):
                 if j == 0:
                     continue
-                #myrange,myset=mytuple
-                myset = set(rangedict[myrange])
-                result = set(rangedict[list_of_ranges[j - 1]]) ^ myset
-                rangedict[myrange] = list(result)
+                #the upcoming range is used by the symmetric difference of the previous
+                #and current sets of snapshots
+                myset = set(return_snapshots(rangedict[myrange]))
+                result = set(return_snapshots(rangedict[list_of_ranges[j - 1]])) ^ myset
+                #again store the result in an array, not a list
+                subvol_list = array.array('Q')
+                for subvol in result:
+                    data = return_coded(rangedict[myrange], subvol)
+                    if data is None:
+                        data = return_coded(rangedict[list_of_ranges[j - 1]], subvol)
+                    if data is None:
+                        print("problem!", data, subvol)
+                        continue
+                    subvol_list.append(data)
+                rangedict[myrange] = subvol_list
             self._tree[extent] = rangedict
             i += 1
 
@@ -99,29 +180,40 @@ def __len__(self):
         return result
 
     #find those ranges that have only one snapshot; if this snapshot is deleted
-    #this space will be freed
-    def find_unique(self):
+    #this space will be freed.
+    #based on the scenario in transform it should return:
+    #result[tree1] = pos2-pos1 + pos4-pos3
+    #result[tree2] = 0
+    #if files are analyzed, use the inode data to find them and store them in a separate dictionary.
+    def find_unique(self, fs, analyze_file):
         result = Counter()
+        result_data = defaultdict(set)
         for extent, rangedict in self._tree.items():
             iterableview = sorted(rangedict.items())
             for i, mytuple in enumerate(iterableview):
-                myrange, myset = mytuple
+                myrange, unique_pair_list = mytuple
                 #myset=list(myset)
-                if len(myset) == 1:
-                    try:
-                        size = iterableview[i + 1][0] - myrange
-                        result[myset[0]] += size
-                    except:
-                        print(extent, rangedict, mytuple)
-        return result
-
-    #helper function to find the size of the ranges that have the desired snapshots
+                if len(unique_pair_list) == 1:
+                    subvolume, inode = unique_to_pair(unique_pair_list[0])
+                    size = iterableview[i + 1][0] - myrange
+                    result[subvolume] += size
+                    #result[myset[0]]+=size
+                    #print(inode)
+                    if analyze_file:
+                        try:
+                            file = btrfs.ioctl.ino_lookup(fs.fd, subvolume, inode)
+                            result_data[file.name_bytes.decode('utf-8')].add(subvolume)
+                        except:
+                            print("Inode not found", inode)
+        return result, result_data
+
+    #helper function to find the size of the extent ranges that have the desired snapshots
     def find_snapshots_size(self, wanted, not_wanted):
         result = 0
         for extent, rangedict in self._tree.items():
             rangelist = sorted(rangedict.keys())
             for i, myrange in enumerate(rangelist):
-                snapshots = set(rangedict[myrange])
+                snapshots = set(return_snapshots(rangedict[myrange]))
                 if len(set(wanted) & snapshots) > 0 and len(set(not_wanted) & snapshots) == 0:
                     try:
                         result += rangelist[i + 1] - myrange
@@ -130,10 +222,12 @@ def find_snapshots_size(self,wanted,not_wanted):
                         print(extent, sorted(rangedict.items()), myrange)
         return result
 
+    #the active subvolume must be the last one
     def add_snapshots(self, snapshots):
         self._snapshots = snapshots.copy()
 
     #calculate the size of ranges on top of the previous subvolume
+    #older subvolumes must come first in the subvolume list
    def find_snapshot_size_to_previous(self):
         results = Counter()
         for i, snapshot in enumerate(self._snapshots):
@@ -154,54 +248,84 @@ def find_snapshot_size_to_current(self):
             results[snapshot] += self.find_snapshots_size([snapshot], [current])
         return results
 
-
+#main function to parse data from disk and add them to the tree of extents
 def disk_parse(data_tree, fs, tree):
     print("Parsing subvolume:", tree)
-    for header, data in btrfs.ioctl.search_v2(fs.fd, tree):
+    min_key = btrfs.ctree.Key(0, btrfs.ctree.EXTENT_DATA_KEY, 0)
+    for header, data in btrfs.ioctl.search_v2(fs.fd, tree, min_key):
         if header.type == btrfs.ctree.EXTENT_DATA_KEY:
             datum = btrfs.ctree.FileExtentItem(header, data)
             if datum.type != btrfs.ctree.FILE_EXTENT_INLINE:  # and datum.disk_bytenr != 0:
                 key = (datum.disk_bytenr, datum.disk_num_bytes)
                 stop = datum.offset + datum.num_bytes
-                data_tree.add(tree, key, datum.offset, stop)
-
+                data_tree.add(tree, key, datum.offset, stop, datum.key.objectid)
+
 
 def main():
     parser = argparse.ArgumentParser()
-    parser.add_argument("-u", "--unique", action='store_true', help="calculate only unique data, -r makes no sense")
+    parser.add_argument("-u", "--unique", action='store_true', help="calculate only unique data; the -r argument makes no sense if -u is active")
+    parser.add_argument("-f", "--files", action='store_true', help="find filenames that exist in unique extents")
     parser.add_argument("path", type=str,
                         help="path of the btrfs filesystem")
     parser.add_argument("-r", "--root", type=int, default=5,
-                        help="current active subvolume to analyze, default is 5")
-    parser.add_argument('subvolume', nargs='*', type=int, help='Do not analyze these subvolumes')
+                        help="current active subvolume to analyze first, default is 5")
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument('-i', '--ignore', action='store_true', help="Do not analyze the specified subvolumes")
+    group.add_argument('-o', '--only', action='store_true', help="Analyze only the specified subvolumes")
+    parser.add_argument('subvolume', nargs='*', type=int, help='Subvolumes to ignore or analyze')
     args = parser.parse_args()
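+
+    #example invocations (the script filename below is hypothetical):
+    #  python3 subvolume_sizes.py /mnt/data                analyze every subvolume
+    #  python3 subvolume_sizes.py -u -f /mnt/data          only unique data, plus unique filenames
+    #  python3 subvolume_sizes.py -i /mnt/data 257 258     skip subvolumes 257 and 258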
-
-    #list of ignored subvolumes
-    ignored_trees = set(args.subvolume)
-    ignored_trees.add(args.root)
+
+    #find the subvolumes to parse, making sure the -r subvolume stays first
+    parse_trees = [5]
+    if args.root != 5:
+        parse_trees = [args.root, 5]
     fs = btrfs.FileSystem(args.path)
+    for subvol in fs.subvolumes():
+        if subvol.key.objectid != args.root:
+            parse_trees.append(subvol.key.objectid)
+
+    #these are the subvolumes specified by the user; either they will be ignored
+    #or all the other subvolumes will be ignored
+    special_subvolumes = set(args.subvolume)
+
+    #if no argument is specified, assume that the user wanted to ignore the specified subvolumes
+    if not args.ignore and not args.only:
+        args.ignore = True
+
+    #remove the unneeded subvolumes
+    if args.ignore:
+        for item in special_subvolumes:
+            try:
+                parse_trees.remove(item)
+            except ValueError:
+                pass
+    else:
+        for tree in parse_trees[:]:
+            if tree not in special_subvolumes:
+                parse_trees.remove(tree)
 
-    #data_tree=sortedcontainers.SortedDict()
-    #data_tree=dict()
     data_tree = TreeWrapper()
-    #data_tree=TreeWrapperSql()
-    #data_tree=TreeWrapperCompress()
-    snapshots = []
-
-    disk_parse(data_tree, fs, args.root)
-    snapshots.append(args.root)
 
-    for subvol in fs.subvolumes():
-        tree = subvol.key.objectid
-        if tree not in ignored_trees:
-            disk_parse(data_tree, fs, tree)
-            snapshots.append(tree)
-    changed_snapshots = deque(snapshots)
+    #move the root subvolume to the end
+    #older subvolumes must come first
+    changed_snapshots = deque(parse_trees)
     changed_snapshots.rotate(-1)
-    data_tree.add_snapshots(list(changed_snapshots))
+    parse_trees = list(changed_snapshots)
+    data_tree.add_snapshots(parse_trees)
+
+    #parse the trees from newer to older
+    parse_trees = list(reversed(parse_trees))
+    print("Subvolumes to parse:", parse_trees)
+    for tree in parse_trees:
+        disk_parse(data_tree, fs, tree)
+
     data_tree.transform()
+    #print(unique_number.cache_info())
+    #print(unique_to_pair.cache_info())
+    #print(compare_pair_to_snapshot.cache_info())
     unique_sum = 0
-    unique_data = data_tree.find_unique()
+    unique_data, files = data_tree.find_unique(fs, args.files)
+    #if only the unique analysis is needed, do not calculate the differences
     if args.unique:
         current_data = Counter()
         previous_data = Counter()
@@ -212,11 +336,16 @@ def main():
     print(" per subvolume of previous subvolume current(act) subvolume")
     print("---------------------|---------------------|----------------------")
     print("SubvolumId Size Size Size")
-    for snapshot in reversed(changed_snapshots):
+    for snapshot in parse_trees:
         print("{:>10} {:>10} {:>10} {:>10}".format(snapshot, btrfs.utils.pretty_size(unique_data[snapshot]), btrfs.utils.pretty_size(previous_data[snapshot]), btrfs.utils.pretty_size(current_data[snapshot])))
+        #print(files[snapshot])
         unique_sum += unique_data[snapshot]
     print("Size/Cost of subvolumes:", btrfs.utils.pretty_size(unique_sum), "Volatility:", "{:.2%}".format(unique_sum / len(data_tree)))
-
+    if args.files:
+        print()
+        print("Possible Unique Files:")
+        for file, myset in files.items():
+            print(file, ":", myset)
 
 if __name__ == '__main__':
     main()