I have written a Python program to parse the Master File Table located by //?/X:/$MFT
.
Of course trying to open it normally will result in PermissionDenied
. I got around this by open(f"//?/{drive}:", "rb")
, and get the starting cluster of $MFT and read the first 1024 bytes starting at 4096 * cluster count.
I wrote this program to parse all file records in $MFT and store them in memory, each file record has a length of 1024 bytes, and I am interested in their $STANDARD_INFORMATION, $ATTRIBUTE_LIST, $FILE_NAME and (non-resident) $DATA attributes, because I want to resolve file record segment addresses from D:\System Volume Information\Chkdsk\chkdskyyyymmddHHMMSS.log
to absolute file paths.
Because NTFS is very complex I won't bother describing how it works, I wrote all of the code entirely by myself, using this, this and this as references.
The code is incomplete, but the core functionalities are implemented and working properly, I am trying to organize the resultant information better and make it use less memory, I have only parsed a select few attributes and flags because they are what I wanted, and other attributes will make the objects use more memory.
I have determined that if I load all 2247424 entries in the target $MFT I can expect the program to use 3GiB 376MiB 562KiB 380B memory, and I want to make it use less memory.
import re
from datetime import datetime, timedelta
from typing import Callable, Generator, Mapping, NamedTuple, Sequence, Set
SHORT_PATH = re.compile(r"^\w{6}~\d+\.\w{3}$")
EPOCH = datetime(1601, 1, 1, 0, 0, 0)
FILE_GOOD = ("BAAD", "FILE")
UNITS = ("B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB", "RiB", "QiB")
BOOT_SECTOR = (
(0, 3, "assembly", 0),
(3, 11, "OEM_ID", 0),
(11, 13, "bytes/sector", 1),
(13, 14, "sectors/cluster", 1),
(21, 22, "media_descriptor", 0),
(24, 26, "sectors/track", 1),
(26, 28, "heads", 1),
(28, 32, "hidden_sectors", 1),
(40, 48, "total_sectors", 1),
(48, 56, "$MFT_cluster", 1),
(56, 64, "$MFTMirr_cluster", 1),
(64, 65, "FRS_length", 1),
(72, 80, "volume_serial_number", 1),
(84, 510, "bootstrap_code", 0),
(510, 512, "end-of-sector", 0),
)
ATTRDEF_FLAGS = ((2, "Indexed"), (64, "Resident"), (128, "Non-Resident"))
ATTRIBUTE_TYPES = {
16: "$STANDARD_INFORMATION",
32: "$ATTRIBUTE_LIST",
48: "$FILE_NAME",
64: "$OBJECT_ID",
80: "$SECURITY_DESCRIPTOR",
96: "$VOLUME_NAME",
112: "$VOLUME_INFORMATION",
128: "$DATA",
144: "$INDEX_ROOT",
160: "$INDEX_ALLOCATION",
176: "$BITMAP",
192: "$REPARSE_POINT",
208: "$EA_INFORMATION",
224: "$EA",
256: "$LOGGED_UTILITY_STREAM",
}
FILE_PERMISSIONS = (1, 2, 4, 32)
HEADER_FLAGS = (1, 2)
class Record_Header(NamedTuple):
Written: int
Hardlinks: int
Record_Size: int
Base_Record: int
Record_ID: int
In_Use: bool
Directory: bool
class NonResident_Attribute(NamedTuple):
Type: str
Real_Size: int
Name: str
Data_Runs: tuple
class Resident_Attribute(NamedTuple):
Type: str
Name: str
Attribute: NamedTuple
class File_Permissions(NamedTuple):
Read_Only: bool
Hidden: bool
System: bool
Archive: bool
class Standard_Information(NamedTuple):
File_Created: datetime
File_Modified: datetime
Record_Changed: datetime
Last_Access: datetime
File_Permissions: File_Permissions
class Attribute_List_Entry(NamedTuple):
Type: int
Base_Record: int
Name: str
class FileName(NamedTuple):
Parent_Record: int
Allocated_Size: int
Real_Size: int
Name: str
class File_Record(NamedTuple):
Header: Record_Header
Attributes: tuple
def parse_file_permissions(data: bytes) -> File_Permissions:
flag = parse_little_endian(data)
return File_Permissions(*(bool(flag & bit) for bit in FILE_PERMISSIONS))
def parse_NTFS_timestamp(data: bytes) -> datetime:
return EPOCH + timedelta(seconds=int.from_bytes(data, "little") * 1e-7)
def parse_sined_little_endian(data: bytes) -> int:
return (
-1 * (1 + sum((b ^ 0xFF) * (1 << i * 8) for i, b in enumerate(data)))
if data[-1] & 128
else int.from_bytes(data, "little")
)
def parse_little_endian(data: bytes) -> int:
return int.from_bytes(data, "little")
def get_attribute_name(data: bytes) -> str:
return ATTRIBUTE_TYPES[parse_little_endian(data)]
FILE_RECORD_HEADER = (
(16, 18, parse_little_endian),
(18, 20, parse_little_endian),
(24, 28, parse_little_endian),
(32, 38, parse_little_endian),
(44, 48, parse_little_endian),
)
NONRESIDENT_HEADER = (
(0, 4, get_attribute_name),
(48, 56, parse_little_endian),
)
A1_STANDARD_INFORMATION = (
(0, 8, parse_NTFS_timestamp),
(8, 16, parse_NTFS_timestamp),
(16, 24, parse_NTFS_timestamp),
(24, 32, parse_NTFS_timestamp),
(32, 36, parse_file_permissions),
)
A2_ATTRIBUTE_LIST = (
(0, 4, get_attribute_name),
(16, 22, parse_little_endian),
)
A3_FILENAME = (
(0, 6, parse_little_endian),
(40, 48, parse_little_endian),
(48, 56, parse_little_endian),
)
def parse_standard_information(data: bytes) -> Standard_Information:
return Standard_Information(
*(func(data[start:end]) for start, end, func in A1_STANDARD_INFORMATION)
)
def parse_attribute_list_gen(data: bytes) -> Generator[Attribute_List_Entry, None, None]:
while len(data) > 26:
offset = 26 + 2 * data[6]
yield Attribute_List_Entry(
*(func(data[start:end]) for start, end, func in A2_ATTRIBUTE_LIST),
data[26:offset:2].decode(),
)
data = data[((offset + 7) >> 3) << 3 :]
def parse_attribute_list(data: bytes) -> tuple:
return tuple(parse_attribute_list_gen(data))
def parse_filename(data: bytes) -> FileName:
name = data[66 : 66 + 2 * data[64]]
try:
name = name.decode("utf8").replace("\x00", "")
except UnicodeDecodeError:
name = name.decode("utf-16-le").replace("\x00", "")
return FileName(*(func(data[start:end]) for start, end, func in A3_FILENAME), name)
def parse_record_header(data: bytes) -> Record_Header:
flag = parse_little_endian(data[22:24])
return Record_Header(
*(func(data[start:end]) for start, end, func in FILE_RECORD_HEADER),
flag & 1,
flag & 2,
)
def ignore():
pass
ATTRIBUTE_PARSERS = {
"$STANDARD_INFORMATION": parse_standard_information,
"$ATTRIBUTE_LIST": parse_attribute_list,
"$FILE_NAME": parse_filename,
"$DATA": ignore,
}
def format_size(length: int) -> str:
string = ""
i = 0
while length and i < 10:
chunk = length & 1023
length >>= 10
if chunk:
string = f"{chunk}{UNITS[i]} {string}"
i += 1
if length:
string = f"{length}QiB {string}"
return string.rstrip()
def process_boot_sector(data: dict) -> None:
data["raw_size"] = size = data["bytes/sector"] * data["total_sectors"]
data["readable_size"] = format_size(size)
data["bytes/cluster"] = cluster = data["bytes/sector"] * data["sectors/cluster"]
data["$MFT_index"] = data["$MFT_cluster"] * cluster
frs_length = data["FRS_length"]
if frs_length < 128:
data["FRS_length"] = frs_length * cluster
else:
data["FRS_length"] = 1 << (256 - frs_length)
def open_partition(drive: str) -> dict:
partition = open(f"//?/{drive}:", "rb")
sector = partition.read(512)
decoded = {}
for start, end, name, little in BOOT_SECTOR:
data = sector[start:end]
if little:
data = int.from_bytes(data, "little")
decoded[name] = data
process_boot_sector(decoded)
partition.seek(0)
return {
"handle": partition,
"info": decoded,
}
def decode_attrdef_name(data: bytes) -> str:
return bytes(b for b in data if b).decode("utf8")
def decode_attrdef_flags(data: bytes) -> list:
flag = int.from_bytes(data, "little")
return [name for bit, name in ATTRDEF_FLAGS if flag & bit]
def parse_attrdef(data: bytes) -> dict:
result = {}
while data[0]:
result[parse_little_endian(data[128:132])] = decode_attrdef_name(data[:128])
data = data[160:]
return result
def preprocess_file_record(data: bytes) -> bytes:
if data[:4] != b"FILE":
raise ValueError("File record is corrupt")
token = data[48:50]
if token != data[510:512] or token != data[1022:1024]:
raise ValueError("File record is corrupt")
update_sequence = data[50:54]
return data[:510] + update_sequence[:2] + data[512:1022] + update_sequence[2:]
def parse_data_runs(data: bytes) -> Generator[tuple, None, None]:
while (size := data[0]) and len(data) > 2:
count = (size & 15) + 1
first = (size >> 4) + count
cluster_count = parse_little_endian(data[1:count])
starting_cluster = parse_little_endian(data[count:first])
data = data[first:]
yield (starting_cluster, cluster_count)
def parse_nonresident(data: bytes, limit: int) -> list:
name_offset = parse_little_endian(data[10:12])
offset = parse_little_endian(data[32:34])
name = data[name_offset:offset:2].decode() if data[9] else ""
return NonResident_Attribute(
*(func(data[start:end]) for start, end, func in NONRESIDENT_HEADER),
name,
tuple(parse_data_runs(data[offset:limit])),
)
def parse_resident(
data: bytes, attribute_type: str, parser: Callable
) -> Resident_Attribute:
length = parse_little_endian(data[16:20])
name_offset = parse_little_endian(data[10:12])
offset = parse_little_endian(data[20:22])
name = data[name_offset:offset:2].decode() if data[9] else ""
return Resident_Attribute(
attribute_type,
name,
parser(data[offset : offset + length]),
)
def parse_record_attributes(data: bytes) -> Generator:
data = data[56:]
while data[:4] != b"\xff\xff\xff\xff":
length = parse_little_endian(data[4:8])
attribute_type = get_attribute_name(data[:4])
if func := ATTRIBUTE_PARSERS.get(attribute_type):
if data[8]:
yield parse_nonresident(data, length)
elif func is not ignore:
attribute = parse_resident(data, attribute_type, func)
if not (
attribute.Type == "$FILE_NAME"
and SHORT_PATH.match(attribute.Attribute.Name)
):
yield attribute
data = data[length:]
def parse_file_record(data: bytes) -> File_Record:
data = preprocess_file_record(data)
return File_Record(parse_record_header(data), tuple(parse_record_attributes(data)))
def get_size(obj: object) -> int:
size = obj.__sizeof__()
if isinstance(obj, Mapping):
size += sum(get_size(k) + get_size(v) for k, v in obj.items())
elif isinstance(obj, Sequence | Set) and not isinstance(obj, str):
size += sum(get_size(e) for e in obj)
return size
if __name__ == "__main__":
partition = open_partition("D")
handle = partition["handle"]
info = partition["info"]
frs_length = info["FRS_length"]
handle.seek(info["$MFT_index"])
records = []
for _ in range(65536):
data = handle.read(1024)
records.append(parse_file_record(data))
print(format_size(get_size(records) // 65536))
Example records:
In [2]: records[:16]
Out[2]:
[File_Record(Header=Record_Header(Written=1, Hardlinks=1, Record_Size=408, Base_Record=0, Record_ID=0, In_Use=1, Directory=0), Attributes=(Resident_Attribute(Type='$STANDARD_INFORMATION', Name='', Attribute=Standard_Information(File_Created=datetime.datetime(2021, 3, 21, 9, 41, 47, 819000), File_Modified=datetime.datetime(2021, 3, 21, 9, 41, 47, 819000), Record_Changed=datetime.datetime(2021, 3, 21, 9, 41, 47, 819000), Last_Access=datetime.datetime(2021, 3, 21, 9, 41, 47, 819000), File_Permissions=File_Permissions(Read_Only=False, Hidden=True, System=True, Archive=False))), Resident_Attribute(Type='$FILE_NAME', Name='', Attribute=FileName(Parent_Record=5, Allocated_Size=2301362176, Real_Size=2301362176, Name='$MFT')), NonResident_Attribute(Type='$DATA', Real_Size=2301362176, Name='', Data_Runs=((2, 561856),)))),
File_Record(Header=Record_Header(Written=1, Hardlinks=1, Record_Size=344, Base_Record=0, Record_ID=1, In_Use=1, Directory=0), Attributes=(Resident_Attribute(Type='$STANDARD_INFORMATION', Name='', Attribute=Standard_Information(File_Created=datetime.datetime(2021, 3, 21, 9, 41, 47, 819000), File_Modified=datetime.datetime(2021, 3, 21, 9, 41, 47, 819000), Record_Changed=datetime.datetime(2021, 3, 21, 9, 41, 47, 819000), Last_Access=datetime.datetime(2021, 3, 21, 9, 41, 47, 819000), File_Permissions=File_Permissions(Read_Only=False, Hidden=True, System=True, Archive=False))), Resident_Attribute(Type='$FILE_NAME', Name='', Attribute=FileName(Parent_Record=5, Allocated_Size=4096, Real_Size=4096, Name='$MFTMirr')), NonResident_Attribute(Type='$DATA', Real_Size=4096, Name='', Data_Runs=((171046397, 1),)))),
File_Record(Header=Record_Header(Written=2, Hardlinks=1, Record_Size=344, Base_Record=0, Record_ID=2, In_Use=1, Directory=0), Attributes=(Resident_Attribute(Type='$STANDARD_INFORMATION', Name='', Attribute=Standard_Information(File_Created=datetime.datetime(2021, 3, 21, 9, 41, 47, 819000), File_Modified=datetime.datetime(2021, 3, 21, 9, 41, 47, 819000), Record_Changed=datetime.datetime(2021, 3, 21, 9, 41, 47, 819000), Last_Access=datetime.datetime(2021, 3, 21, 9, 41, 47, 819000), File_Permissions=File_Permissions(Read_Only=False, Hidden=True, System=True, Archive=False))), Resident_Attribute(Type='$FILE_NAME', Name='', Attribute=FileName(Parent_Record=5, Allocated_Size=67108864, Real_Size=67108864, Name='$LogFile')), NonResident_Attribute(Type='$DATA', Real_Size=67108864, Name='', Data_Runs=((573699706, 16384),)))),
File_Record(Header=Record_Header(Written=3, Hardlinks=1, Record_Size=416, Base_Record=0, Record_ID=3, In_Use=1, Directory=0), Attributes=(Resident_Attribute(Type='$STANDARD_INFORMATION', Name='', Attribute=Standard_Information(File_Created=datetime.datetime(2021, 3, 21, 9, 41, 47, 819000), File_Modified=datetime.datetime(2021, 3, 21, 9, 41, 47, 819000), Record_Changed=datetime.datetime(2021, 3, 21, 9, 41, 47, 819000), Last_Access=datetime.datetime(2021, 3, 21, 9, 41, 47, 819000), File_Permissions=File_Permissions(Read_Only=False, Hidden=True, System=True, Archive=False))), Resident_Attribute(Type='$FILE_NAME', Name='', Attribute=FileName(Parent_Record=5, Allocated_Size=0, Real_Size=0, Name='$Volume')))),
File_Record(Header=Record_Header(Written=4, Hardlinks=1, Record_Size=344, Base_Record=0, Record_ID=4, In_Use=1, Directory=0), Attributes=(Resident_Attribute(Type='$STANDARD_INFORMATION', Name='', Attribute=Standard_Information(File_Created=datetime.datetime(2021, 3, 21, 9, 41, 47, 819000), File_Modified=datetime.datetime(2021, 3, 21, 9, 41, 47, 819000), Record_Changed=datetime.datetime(2021, 3, 21, 9, 41, 47, 819000), Last_Access=datetime.datetime(2021, 3, 21, 9, 41, 47, 819000), File_Permissions=File_Permissions(Read_Only=False, Hidden=True, System=True, Archive=False))), Resident_Attribute(Type='$FILE_NAME', Name='', Attribute=FileName(Parent_Record=5, Allocated_Size=4096, Real_Size=2560, Name='$AttrDef')), NonResident_Attribute(Type='$DATA', Real_Size=2560, Name='', Data_Runs=((165002433, 1),)))),
File_Record(Header=Record_Header(Written=5, Hardlinks=1, Record_Size=768, Base_Record=0, Record_ID=5, In_Use=1, Directory=2), Attributes=(Resident_Attribute(Type='$STANDARD_INFORMATION', Name='', Attribute=Standard_Information(File_Created=datetime.datetime(2021, 3, 21, 9, 41, 47, 819000), File_Modified=datetime.datetime(2024, 1, 29, 14, 58, 44, 660774), Record_Changed=datetime.datetime(2024, 1, 29, 14, 58, 44, 660774), Last_Access=datetime.datetime(2024, 1, 29, 14, 58, 45, 661135), File_Permissions=File_Permissions(Read_Only=False, Hidden=True, System=True, Archive=False))), Resident_Attribute(Type='$FILE_NAME', Name='', Attribute=FileName(Parent_Record=5, Allocated_Size=0, Real_Size=0, Name='.')))),
File_Record(Header=Record_Header(Written=6, Hardlinks=1, Record_Size=344, Base_Record=0, Record_ID=6, In_Use=1, Directory=0), Attributes=(Resident_Attribute(Type='$STANDARD_INFORMATION', Name='', Attribute=Standard_Information(File_Created=datetime.datetime(2021, 3, 21, 9, 41, 47, 819000), File_Modified=datetime.datetime(2021, 3, 21, 9, 41, 47, 819000), Record_Changed=datetime.datetime(2021, 3, 21, 9, 41, 47, 819000), Last_Access=datetime.datetime(2021, 3, 21, 9, 41, 47, 819000), File_Permissions=File_Permissions(Read_Only=False, Hidden=True, System=True, Archive=False))), Resident_Attribute(Type='$FILE_NAME', Name='', Attribute=FileName(Parent_Record=5, Allocated_Size=122044416, Real_Size=122044128, Name='$Bitmap')), NonResident_Attribute(Type='$DATA', Real_Size=122044128, Name='', Data_Runs=((171046398, 24577), (4124482825, 5219))))),
File_Record(Header=Record_Header(Written=7, Hardlinks=1, Record_Size=440, Base_Record=0, Record_ID=7, In_Use=1, Directory=0), Attributes=(Resident_Attribute(Type='$STANDARD_INFORMATION', Name='', Attribute=Standard_Information(File_Created=datetime.datetime(2021, 3, 21, 9, 41, 47, 819000), File_Modified=datetime.datetime(2021, 3, 21, 9, 41, 47, 819000), Record_Changed=datetime.datetime(2021, 3, 21, 9, 41, 47, 819000), Last_Access=datetime.datetime(2021, 3, 21, 9, 41, 47, 819000), File_Permissions=File_Permissions(Read_Only=False, Hidden=True, System=True, Archive=False))), Resident_Attribute(Type='$FILE_NAME', Name='', Attribute=FileName(Parent_Record=5, Allocated_Size=8192, Real_Size=8192, Name='$Boot')), NonResident_Attribute(Type='$DATA', Real_Size=8192, Name='', Data_Runs=((0, 2),)))),
File_Record(Header=Record_Header(Written=8, Hardlinks=1, Record_Size=376, Base_Record=0, Record_ID=8, In_Use=1, Directory=0), Attributes=(Resident_Attribute(Type='$STANDARD_INFORMATION', Name='', Attribute=Standard_Information(File_Created=datetime.datetime(2021, 3, 21, 9, 41, 47, 819000), File_Modified=datetime.datetime(2021, 3, 21, 9, 41, 47, 819000), Record_Changed=datetime.datetime(2021, 3, 21, 9, 41, 47, 819000), Last_Access=datetime.datetime(2021, 3, 21, 9, 41, 47, 819000), File_Permissions=File_Permissions(Read_Only=False, Hidden=True, System=True, Archive=False))), Resident_Attribute(Type='$FILE_NAME', Name='', Attribute=FileName(Parent_Record=5, Allocated_Size=0, Real_Size=0, Name='$BadClus')), NonResident_Attribute(Type='$DATA', Real_Size=3999141965824, Name='$Bad', Data_Runs=((0, 976353019),)))),
File_Record(Header=Record_Header(Written=9, Hardlinks=1, Record_Size=968, Base_Record=0, Record_ID=9, In_Use=1, Directory=0), Attributes=(Resident_Attribute(Type='$STANDARD_INFORMATION', Name='', Attribute=Standard_Information(File_Created=datetime.datetime(2021, 3, 21, 9, 41, 47, 819000), File_Modified=datetime.datetime(2021, 3, 21, 9, 41, 47, 819000), Record_Changed=datetime.datetime(2021, 3, 21, 9, 41, 47, 819000), Last_Access=datetime.datetime(2021, 3, 21, 9, 41, 47, 819000), File_Permissions=File_Permissions(Read_Only=False, Hidden=True, System=True, Archive=False))), Resident_Attribute(Type='$FILE_NAME', Name='', Attribute=FileName(Parent_Record=5, Allocated_Size=0, Real_Size=0, Name='$Secure')), NonResident_Attribute(Type='$DATA', Real_Size=319444, Name='$SDS', Data_Runs=((684234, 78),)))),
File_Record(Header=Record_Header(Written=10, Hardlinks=1, Record_Size=408, Base_Record=0, Record_ID=10, In_Use=1, Directory=0), Attributes=(Resident_Attribute(Type='$STANDARD_INFORMATION', Name='', Attribute=Standard_Information(File_Created=datetime.datetime(2021, 3, 21, 9, 41, 47, 819000), File_Modified=datetime.datetime(2021, 3, 21, 9, 41, 47, 819000), Record_Changed=datetime.datetime(2021, 3, 21, 9, 41, 47, 819000), Last_Access=datetime.datetime(2021, 3, 21, 9, 41, 47, 819000), File_Permissions=File_Permissions(Read_Only=False, Hidden=True, System=True, Archive=False))), Resident_Attribute(Type='$FILE_NAME', Name='', Attribute=FileName(Parent_Record=5, Allocated_Size=131072, Real_Size=131072, Name='$UpCase')), NonResident_Attribute(Type='$DATA', Real_Size=131072, Name='', Data_Runs=((158628201, 32),)))),
File_Record(Header=Record_Header(Written=11, Hardlinks=1, Record_Size=952, Base_Record=0, Record_ID=11, In_Use=1, Directory=2), Attributes=(Resident_Attribute(Type='$STANDARD_INFORMATION', Name='', Attribute=Standard_Information(File_Created=datetime.datetime(2021, 3, 21, 9, 41, 47, 819000), File_Modified=datetime.datetime(2021, 3, 21, 9, 41, 47, 819000), Record_Changed=datetime.datetime(2021, 3, 21, 9, 41, 47, 819000), Last_Access=datetime.datetime(2023, 3, 13, 17, 37, 23, 724527), File_Permissions=File_Permissions(Read_Only=False, Hidden=True, System=True, Archive=False))), Resident_Attribute(Type='$FILE_NAME', Name='', Attribute=FileName(Parent_Record=5, Allocated_Size=0, Real_Size=0, Name='$Extend')))),
File_Record(Header=Record_Header(Written=12, Hardlinks=0, Record_Size=288, Base_Record=0, Record_ID=12, In_Use=1, Directory=0), Attributes=(Resident_Attribute(Type='$STANDARD_INFORMATION', Name='', Attribute=Standard_Information(File_Created=datetime.datetime(2021, 3, 21, 9, 41, 47, 819000), File_Modified=datetime.datetime(2021, 3, 21, 9, 41, 47, 819000), Record_Changed=datetime.datetime(2021, 3, 21, 9, 41, 47, 819000), Last_Access=datetime.datetime(2021, 3, 21, 9, 41, 47, 819000), File_Permissions=File_Permissions(Read_Only=False, Hidden=True, System=True, Archive=False))),)),
File_Record(Header=Record_Header(Written=13, Hardlinks=0, Record_Size=288, Base_Record=0, Record_ID=13, In_Use=1, Directory=0), Attributes=(Resident_Attribute(Type='$STANDARD_INFORMATION', Name='', Attribute=Standard_Information(File_Created=datetime.datetime(2021, 3, 21, 9, 41, 47, 819000), File_Modified=datetime.datetime(2021, 3, 21, 9, 41, 47, 819000), Record_Changed=datetime.datetime(2021, 3, 21, 9, 41, 47, 819000), Last_Access=datetime.datetime(2021, 3, 21, 9, 41, 47, 819000), File_Permissions=File_Permissions(Read_Only=False, Hidden=True, System=True, Archive=False))),)),
File_Record(Header=Record_Header(Written=14, Hardlinks=0, Record_Size=288, Base_Record=0, Record_ID=14, In_Use=1, Directory=0), Attributes=(Resident_Attribute(Type='$STANDARD_INFORMATION', Name='', Attribute=Standard_Information(File_Created=datetime.datetime(2021, 3, 21, 9, 41, 47, 819000), File_Modified=datetime.datetime(2021, 3, 21, 9, 41, 47, 819000), Record_Changed=datetime.datetime(2021, 3, 21, 9, 41, 47, 819000), Last_Access=datetime.datetime(2021, 3, 21, 9, 41, 47, 819000), File_Permissions=File_Permissions(Read_Only=False, Hidden=True, System=True, Archive=False))),)),
File_Record(Header=Record_Header(Written=15, Hardlinks=0, Record_Size=288, Base_Record=0, Record_ID=15, In_Use=1, Directory=0), Attributes=(Resident_Attribute(Type='$STANDARD_INFORMATION', Name='', Attribute=Standard_Information(File_Created=datetime.datetime(2021, 3, 21, 9, 41, 47, 819000), File_Modified=datetime.datetime(2021, 3, 21, 9, 41, 47, 819000), Record_Changed=datetime.datetime(2021, 3, 21, 9, 41, 47, 819000), Last_Access=datetime.datetime(2021, 3, 21, 9, 41, 47, 819000), File_Permissions=File_Permissions(Read_Only=False, Hidden=True, System=True, Archive=False))),))]
How can I improve this program?
Edit
Minor oversight, in the original code $ATTRIBUTE_LIST
entries won't be parsed but a generator object will be created, the data isn't stored in memory and this is unintended. I have fixed it.
-
\$\begingroup\$ I don't see a way around using at least as much memory as the data structures themselves. 2 things that stand out to me are: packed structs in Cython, and using a generator approach if you can seek records in the MFT \$\endgroup\$Tamoghna Chowdhury– Tamoghna Chowdhury2024年01月30日 09:27:55 +00:00Commented Jan 30, 2024 at 9:27
-
\$\begingroup\$ Have you tried any of the off-the-shelf options - pyMFTGrabber, analyzeMFT, etc.? \$\endgroup\$Reinderien– Reinderien2024年01月31日 09:13:51 +00:00Commented Jan 31, 2024 at 9:13
-
\$\begingroup\$ @Reinderien I have looked into them before I wrote this program and they aren't looking great, all those I have investigated were written for Python 2 and their code quality is very poor, at least much poorer than mine. And that is why I wrote this program. And I need the raw data in memory, I am not looking for human readable serialization because I need the data for further processing. \$\endgroup\$Ξένη Γήινος– Ξένη Γήινος2024年01月31日 09:36:48 +00:00Commented Jan 31, 2024 at 9:36
1 Answer 1
In UNITS
you should split the IEC prefix (Mi, Gi, etc.) away from the unit B
.
BOOT_SECTOR
is not structured enough - it's entirely unclear what those four fields are, and this could be fixed by constructing from each of those sequences a named tuple. Further, though, those seem like field offsets with names, so you should throw the whole thing away and use a proper ctypes
structure definition. This pattern occurs multiple times through your code. The theme is - reduce the amount of magic-number offsets in your code, and move away from imperative decoding and toward declarative decoding.
timedelta(seconds=int.from_bytes(data, "little") * 1e-7)
is incorrectly using seconds
and a large multiplier, when it should use microseconds
and a smaller multiplier.
Never write -1 *
; use a unary negation -
instead.
parse_sined
should be spelled parse_signed
. More importantly, that method is doing a lot of manual bit-munging that should be deleted and replaced with proper ctypes
and struct
unpacking calls.
In format_size
, re-formatting of the style string = f'{string}'
is not efficient. Ideally this would be split into two functions - one a generator yielding string fragments, and the outer function performing a join
.
Why would you rstrip
on a string you've constructed yourself?
Use of data: dict
is poorly typed for a few reasons. First, don't use dict
without specifying its inner types. More importantly, don't use dict
at all; pass around immutable structures. "Processing" a data dictionary by mutating it after only half of it has been constructed is an anti-pattern; typical solutions are to split it into two objects, or perform construction of all of the values at once in a @classmethod
.
There's more, but that (especially use of ctypes
) will go a long way.