The first lines of the rarpath
module (see here for context, the rest is not used here):
import enum
import re
import typing
from pathlib import Path
class RarVersion(enum.IntEnum):
AMBIGUOUS = 0
V3 = 3
V5 = 5
And shared
module:
import pathlib
SEVENZIP = pathlib.Path("C:\Program Files7円-Zip7円z.exe")
The rarutils
module:
import dataclasses
import enum
import logging
import os
import pathlib
import re
import subprocess
import typing
import rarpath
from shared import SEVENZIP
logger = logging.getLogger(__name__)
class Algo(enum.IntEnum):
"""This class contains the possible hash algorithms."""
CRC32 = 1
MD5 = 2
SHA1 = 3
SHA256 = 4
SHA512 = 5
@dataclasses.dataclass
class FileEntry:
"""This class contains information about a file, either on disk or as part of an archive."""
path: pathlib.Path
size: int
is_dir: bool
hash_value: bytes | None = None
algo: Algo | None = None
@dataclasses.dataclass
class RarFile:
"""This class contains information about a RAR file."""
version: rarpath.RarVersion
path: pathlib.Path
files: list[FileEntry]
password: str | None = None
@classmethod
def from_path(
cls, archive_path: pathlib.Path, password: str | None = None
) -> typing.Self:
"""Create a RarFile object by reading information from a RAR archive given its path."""
infos = list_rar(archive_path, password)
type_entries = [entry for entry in infos if "Type" in entry]
if not type_entries or len(type_entries) > 1:
raise ValueError(f"No 'Type' entries found in {archive_path}")
if type_entries[0]["Type"] == "Rar3":
version = rarpath.RarVersion.V3
elif type_entries[0]["Type"] == "Rar5":
version = rarpath.RarVersion.V5
else:
raise ValueError(
f"Unknown RAR version {type_entries[0]['Type']} in {archive_path}"
)
files = []
for entry in infos:
if "Path" in entry and "Type" not in entry:
entry_path = pathlib.Path(entry["Path"])
size = int(entry["Size"])
is_dir = entry["Folder"] == "+"
hash_value = None
algo = None
if version == rarpath.RarVersion.V3:
# RAR5 hashes the file contents again with their respective mtimes,
# so the CRCs in the header are not useful for verification.
hash_value = bytes.fromhex(entry["CRC"]) if "CRC" in entry else None
algo = Algo.CRC32 if hash_value else None
files.append(FileEntry(entry_path, size, is_dir, hash_value, algo))
return cls(version, archive_path, files, password)
@property
def hash_values_exist(self) -> bool:
"""Check if *all* files already have hash values."""
return all(file.hash_value for file in self.files if not file.is_dir)
def update_hash_values(self):
"""Update the hash values of all files in the archive.
This will always use the slow method. """
for entry in self.files:
if not entry.hash_value and not entry.is_dir:
try:
crc = get_crc32_slow(
self.path, entry.path, self.password
) # used for V5, slow
entry.hash_value = crc
except subprocess.CalledProcessError:
logger.error(f"Failed to get CRC32 for {entry.path}")
def pretty_str(self) -> str:
"""Return a pretty string representation of RAR file info and its contents."""
maxlen_path = max(len(str(file.path)) for file in self.files)
maxlen_size = max(len(str(file.size)) for file in self.files)
maxlen_hash = 8 if any(file.hash_value for file in self.files) else 0
maxlen_algo = 5 if any(file.algo for file in self.files) else 0
ret = f"RAR file: {self.path}\n"
ret += f"Version: {self.version.name}\n"
ret += f"Encrypted: {self.password is not None}\n"
ret += f"Password: {self.password}\n\n"
for file in self.files:
hash_str = file.hash_value.hex() if file.hash_value else ""
algo_str = file.algo.name if file.algo else ""
ret += (
f" {str(file.path):<{maxlen_path}} {'D' if file.is_dir else 'F':1} "
f"{file.size:>{maxlen_size}} {hash_str:<{maxlen_hash}} {algo_str:<{maxlen_algo}}\n"
)
return ret
def get_crc32_slow(
archive_path: pathlib.Path | str,
entry_path: pathlib.Path | str,
password: str | None = None,
) -> bytes | None:
"""The slow method to get the CRC32 of a file in a RAR archive.
7z extracts files internally - this is necessary for RAR5 archives,
where we can't use the CRCs in the header."""
command_line = [
SEVENZIP,
"t",
"-scrc",
"-p" + (password or ""),
archive_path,
entry_path,
]
logger.debug(
f"Processing archive {archive_path} with path {entry_path} using password {password}"
)
sub = subprocess.run(command_line, capture_output=True, check=False)
if sub.returncode != 0:
logger.error(
f"Command line {' '.join(map(str, command_line))} returned {sub.returncode}"
)
return None
encoding = os.device_encoding(0)
if encoding:
lines = sub.stdout.decode(encoding, errors="ignore").splitlines()
else:
raise RuntimeError("Coud not get encoding")
crc_match = next(
(
m.group(1)
for line in lines
if (m := re.match(r".*CRC32.*data.*([A-F0-9]{8}).*", line))
),
None,
)
if not crc_match:
logger.debug(f"Failed to get CRC for {archive_path}: {entry_path}")
return None
logger.debug(f"Got CRC {crc_match} for {archive_path}: {entry_path}")
return bytes.fromhex(crc_match)
def list_rar(
archive: pathlib.Path, password: str | None = None
) -> list[dict[str, str]]:
"""Get an info list about the archive and its contents."""
logger.debug(f"Listing {archive}, using password {password}")
command_line = [
str(SEVENZIP),
"l",
"-slt",
"-p" + (password if password else ""),
str(archive),
]
sub = subprocess.run(command_line, capture_output=True, check=False)
if sub.returncode != 0:
logger.error(
f"Command line {' '.join(map(str, command_line))} returned {sub.returncode}"
)
return []
encoding = os.device_encoding(0)
if encoding:
entries = sub.stdout.decode(encoding, errors="ignore").split(2 * os.linesep)
else:
raise RuntimeError("Coud not get encoding")
ret = []
for entry in entries:
lines = entry.splitlines()
kv_pairs = [line.split("=", 1) for line in lines if "=" in line]
entry_dict = {k.strip(): v.strip() for k, v in kv_pairs if k.strip()}
if entry_dict:
ret.append(entry_dict)
logger.debug(f"Found {len(ret)} files in {archive}")
return ret
Some remarks:
The Algo
class and the FileEntry
class will later be moved to their own module, when support for SFV-files and the like is implemented (right now it's still redundant to have different hash types, since we only deal with CRC32).
For this application, it's essential that we get the standard hashes (that also every utility like rhash
delivers) from the RAR file. Which RAR v3 indeed uses in its headers, but not RAR v5, where the mtime of the file is included in the hash.
The main class RarFile
is to open a rar file and hold information mainly about its contents, i.e., the file paths, sizes and their hash_values. The slow method for RAR v5 will not be done without explicitly calling update_hash_values
.
And sure, a direct call to 7zip library (if it exists) via C-interface may be possible, but just opening the subprocess and reading stdout
was obviously easier.
3 Answers 3
When you write
"C:\Program Files7円-Zip7円z.exe"
you should use the r
string literal prefix to make it clear to the interpreter that you didn't intend to write escape sequences.
When you define a @dataclass
, consider settings slots=True
for performance and robustness reasons.
This:
f" {str(file.path)
does not need to str()
; string casts are implied in arguments to an interpolated string.
I discourage this pattern:
sub = subprocess.run(command_line, capture_output=True, check=False)
if sub.returncode != 0:
logger.error(
f"Command line {' '.join(map(str, command_line))} returned {sub.returncode}"
)
return None
You should check=True
within a try
. You have a logger (good!) so you can pass exc_info=True
during your except
. That means you could convert to check_output. You should also pass text=True
and not bother about the encoding stuff.
For logger calls like this:
logger.debug(f"Listing {archive}, using password {password}")
Don't do string interpolation. Use the logger's own variadic-argument percent-style, as in
logger.debug('Listing %s, using password %s', archive, password)
-
\$\begingroup\$ AFAIK using a property for size isn't possible. The paths in the
FileEntry
aren't necessarily paths of real files, but only the entries of theRarFile
(where the size was extracted). So I wonder if it is more correct to usePurePath
here, sincePath
is supposed to be for concrete paths. \$\endgroup\$Amaterasu– Amaterasu2025年01月30日 09:52:23 +00:00Commented Jan 30 at 9:52 -
\$\begingroup\$ @Amaterasu you're right! I dropped that part. \$\endgroup\$Reinderien– Reinderien2025年01月30日 12:00:35 +00:00Commented Jan 30 at 12:00
__str__
It is a common practice to use the __str__
function for printing a class. I assume that is the purpose of the pretty_str
function in your RarFile
class. Consider renaming the function as:
def __str__(self):
Once you've created a handle to the class, you can simply print
the handle without using the function name.
Naming
The variable named ret
in this function is not too descriptive. I understand it is sort of a placeholder that you are using for the returned value is some of your functions, but I think it would be better to be more specific. For example, in this function, using something like message
or text
might be more meaningful.
In addition to what has already be said, here are a couple suggestions for you to consider:
Encapsulation
Your module has functions get_crc32_slow
, list_rar
used by your RarFile
class and I am wondering why they are not be instance methods of RarFile
. Even if you think that these functions might have applicability beyond their use by RarFile
, since they are RAR file-related functions, wouldn't making then class methods at the very least be appropriate?
Why @dataclass
?
The @dataclass
decorator is adding many methods to your class that you do not seem to be using. At the same time it seems to me that a client of this class would most likely be always constructing an instance via the from_path
class method and you might even want to make that the exclusive way of constructing instances. But with the class defined as is, there is no way of preventing someone from constructing via the __init__
method that @dataclass
provides. Assuming you would prefer to discourage a client from directly using __init__
, then this would be one way of doing it:
# No @dataclass decorator:
class RarFile:
def __init__(self):
raise RuntimeError('Construct this class using class method from_path')
@classmethod
def from_path(
cls, archive_path: pathlib.Path, password: str | None = None
) -> typing.Self:
...
#return cls(version, archive_path, files, password)
instance = cls.__new__(cls)
instance.version = version
instance.path = archive_path
instance.files = files
instance.password = password
return instance
And, of course, as suggested by the OP, if we are no longer using the @dataclass
decorator we can incorporate the logic of from_path
class method into an __init__
constructor and get rid of from_path
altogether:
class RarFile:
def __init__(self, archive_path: pathlib.Path, password: str | None = None) -> typing.Self:
"""Create a RarFile object by reading information from a RAR archive given its path."""
infos = self.list_rar(archive_path, password) # Now an instance method
...
self.version = version
self.path = archive_path
self.files = files
self.password = password
-
\$\begingroup\$ yes,
get_crc32_slow
,list_rar
should be put into the RarFile class, don't make much sense outside of it. Regarding thefrom_path
... it could be an alternative put it all in__init__
(and use a normal class). \$\endgroup\$viuser– viuser2025年01月30日 22:37:39 +00:00Commented Jan 30 at 22:37 -
\$\begingroup\$ Yes, that would be a reasonable alternative. \$\endgroup\$Booboo– Booboo2025年01月30日 23:01:41 +00:00Commented Jan 30 at 23:01
-
\$\begingroup\$ Ok, after looking at similar code, I see that it's a bit unusual to put I/O operations in the constructor. There are use cases (e.g., unit tests) where you want to construct the object normally via
__init__
and pass the stuff it needs. So it's common to have that and an option to construct it directly via a class method that performs I/O. \$\endgroup\$viuser– viuser2025年02月02日 09:18:08 +00:00Commented Feb 2 at 9:18
Explore related questions
See similar questions with these tags.