Getting information about a RAR file's content

Question 1

The first lines of the rarpath module (see here for context, the rest is not used here):

import enum
import re
import typing
from pathlib import Path
class RarVersion(enum.IntEnum):
 AMBIGUOUS = 0
 V3 = 3
 V5 = 5

And shared module:

import pathlib
SEVENZIP = pathlib.Path("C:\Program Files7円-Zip7円z.exe")

The rarutils module:

import dataclasses
import enum
import logging
import os
import pathlib
import re
import subprocess
import typing
import rarpath
from shared import SEVENZIP
logger = logging.getLogger(__name__)
class Algo(enum.IntEnum):
 """This class contains the possible hash algorithms."""
 CRC32 = 1
 MD5 = 2
 SHA1 = 3
 SHA256 = 4
 SHA512 = 5
@dataclasses.dataclass
class FileEntry:
 """This class contains information about a file, either on disk or as part of an archive."""
 path: pathlib.Path
 size: int
 is_dir: bool
 hash_value: bytes | None = None
 algo: Algo | None = None
@dataclasses.dataclass
class RarFile:
 """This class contains information about a RAR file."""
 version: rarpath.RarVersion
 path: pathlib.Path
 files: list[FileEntry]
 password: str | None = None
 @classmethod
 def from_path(
 cls, archive_path: pathlib.Path, password: str | None = None
 ) -> typing.Self:
 """Create a RarFile object by reading information from a RAR archive given its path."""
 infos = list_rar(archive_path, password)
 type_entries = [entry for entry in infos if "Type" in entry]
 if not type_entries or len(type_entries) > 1:
 raise ValueError(f"No 'Type' entries found in {archive_path}")
 if type_entries[0]["Type"] == "Rar3":
 version = rarpath.RarVersion.V3
 elif type_entries[0]["Type"] == "Rar5":
 version = rarpath.RarVersion.V5
 else:
 raise ValueError(
 f"Unknown RAR version {type_entries[0]['Type']} in {archive_path}"
 )
 files = []
 for entry in infos:
 if "Path" in entry and "Type" not in entry:
 entry_path = pathlib.Path(entry["Path"])
 size = int(entry["Size"])
 is_dir = entry["Folder"] == "+"
 hash_value = None
 algo = None
 if version == rarpath.RarVersion.V3:
 # RAR5 hashes the file contents again with their respective mtimes,
 # so the CRCs in the header are not useful for verification.
 hash_value = bytes.fromhex(entry["CRC"]) if "CRC" in entry else None
 algo = Algo.CRC32 if hash_value else None
 files.append(FileEntry(entry_path, size, is_dir, hash_value, algo))
 return cls(version, archive_path, files, password)
 @property
 def hash_values_exist(self) -> bool:
 """Check if *all* files already have hash values."""
 return all(file.hash_value for file in self.files if not file.is_dir)
 def update_hash_values(self):
 """Update the hash values of all files in the archive.
 This will always use the slow method. """
 for entry in self.files:
 if not entry.hash_value and not entry.is_dir:
 try:
 crc = get_crc32_slow(
 self.path, entry.path, self.password
 ) # used for V5, slow
 entry.hash_value = crc
 except subprocess.CalledProcessError:
 logger.error(f"Failed to get CRC32 for {entry.path}")
 def pretty_str(self) -> str:
 """Return a pretty string representation of RAR file info and its contents."""
 maxlen_path = max(len(str(file.path)) for file in self.files)
 maxlen_size = max(len(str(file.size)) for file in self.files)
 maxlen_hash = 8 if any(file.hash_value for file in self.files) else 0
 maxlen_algo = 5 if any(file.algo for file in self.files) else 0
 ret = f"RAR file: {self.path}\n"
 ret += f"Version: {self.version.name}\n"
 ret += f"Encrypted: {self.password is not None}\n"
 ret += f"Password: {self.password}\n\n"
 for file in self.files:
 hash_str = file.hash_value.hex() if file.hash_value else ""
 algo_str = file.algo.name if file.algo else ""
 ret += (
 f" {str(file.path):<{maxlen_path}} {'D' if file.is_dir else 'F':1} "
 f"{file.size:>{maxlen_size}} {hash_str:<{maxlen_hash}} {algo_str:<{maxlen_algo}}\n"
 )
 return ret
def get_crc32_slow(
 archive_path: pathlib.Path | str,
 entry_path: pathlib.Path | str,
 password: str | None = None,
) -> bytes | None:
 """The slow method to get the CRC32 of a file in a RAR archive.
 7z extracts files internally - this is necessary for RAR5 archives,
 where we can't use the CRCs in the header."""
 command_line = [
 SEVENZIP,
 "t",
 "-scrc",
 "-p" + (password or ""),
 archive_path,
 entry_path,
 ]
 logger.debug(
 f"Processing archive {archive_path} with path {entry_path} using password {password}"
 )
 sub = subprocess.run(command_line, capture_output=True, check=False)
 if sub.returncode != 0:
 logger.error(
 f"Command line {' '.join(map(str, command_line))} returned {sub.returncode}"
 )
 return None
 encoding = os.device_encoding(0)
 if encoding:
 lines = sub.stdout.decode(encoding, errors="ignore").splitlines()
 else:
 raise RuntimeError("Coud not get encoding")
 crc_match = next(
 (
 m.group(1)
 for line in lines
 if (m := re.match(r".*CRC32.*data.*([A-F0-9]{8}).*", line))
 ),
 None,
 )
 if not crc_match:
 logger.debug(f"Failed to get CRC for {archive_path}: {entry_path}")
 return None
 logger.debug(f"Got CRC {crc_match} for {archive_path}: {entry_path}")
 return bytes.fromhex(crc_match)
def list_rar(
 archive: pathlib.Path, password: str | None = None
) -> list[dict[str, str]]:
 """Get an info list about the archive and its contents."""
 logger.debug(f"Listing {archive}, using password {password}")
 command_line = [
 str(SEVENZIP),
 "l",
 "-slt",
 "-p" + (password if password else ""),
 str(archive),
 ]
 sub = subprocess.run(command_line, capture_output=True, check=False)
 if sub.returncode != 0:
 logger.error(
 f"Command line {' '.join(map(str, command_line))} returned {sub.returncode}"
 )
 return []
 encoding = os.device_encoding(0)
 if encoding:
 entries = sub.stdout.decode(encoding, errors="ignore").split(2 * os.linesep)
 else:
 raise RuntimeError("Coud not get encoding")
 ret = []
 for entry in entries:
 lines = entry.splitlines()
 kv_pairs = [line.split("=", 1) for line in lines if "=" in line]
 entry_dict = {k.strip(): v.strip() for k, v in kv_pairs if k.strip()}
 if entry_dict:
 ret.append(entry_dict)
 logger.debug(f"Found {len(ret)} files in {archive}")
 return ret

Some remarks:

The Algo class and the FileEntry class will later be moved to their own module, when support for SFV-files and the like is implemented (right now it's still redundant to have different hash types, since we only deal with CRC32).

For this application, it's essential that we get the standard hashes (that also every utility like rhash delivers) from the RAR file. Which RAR v3 indeed uses in its headers, but not RAR v5, where the mtime of the file is included in the hash.

The main class RarFile is to open a rar file and hold information mainly about its contents, i.e., the file paths, sizes and their hash_values. The slow method for RAR v5 will not be done without explicitly calling update_hash_values.

And sure, a direct call to 7zip library (if it exists) via C-interface may be possible, but just opening the subprocess and reading stdout was obviously easier.

Question 2

When you write

"C:\Program Files7円-Zip7円z.exe"

you should use the r string literal prefix to make it clear to the interpreter that you didn't intend to write escape sequences.

When you define a @dataclass, consider settings slots=True for performance and robustness reasons.

This:

f" {str(file.path)

does not need to str(); string casts are implied in arguments to an interpolated string.

I discourage this pattern:

sub = subprocess.run(command_line, capture_output=True, check=False)
if sub.returncode != 0:
 logger.error(
 f"Command line {' '.join(map(str, command_line))} returned {sub.returncode}"
 )
 return None

You should check=True within a try. You have a logger (good!) so you can pass exc_info=True during your except. That means you could convert to check_output. You should also pass text=True and not bother about the encoding stuff.

For logger calls like this:

logger.debug(f"Listing {archive}, using password {password}")

Don't do string interpolation. Use the logger's own variadic-argument percent-style, as in

logger.debug('Listing %s, using password %s', archive, password)

Question 3

AFAIK using a property for size isn't possible. The paths in the FileEntry aren't necessarily paths of real files, but only the entries of the RarFile (where the size was extracted). So I wonder if it is more correct to use PurePath here, since Path is supposed to be for concrete paths.

Question 4

@Amaterasu you're right! I dropped that part.

Question 5

`str`

It is a common practice to use the __str__ function for printing a class. I assume that is the purpose of the pretty_str function in your RarFile class. Consider renaming the function as:

def __str__(self):

Once you've created a handle to the class, you can simply print the handle without using the function name.

Naming

The variable named ret in this function is not too descriptive. I understand it is sort of a placeholder that you are using for the returned value is some of your functions, but I think it would be better to be more specific. For example, in this function, using something like message or text might be more meaningful.

Question 6

In addition to what has already be said, here are a couple suggestions for you to consider:

Encapsulation

Your module has functions get_crc32_slow, list_rar used by your RarFile class and I am wondering why they are not be instance methods of RarFile. Even if you think that these functions might have applicability beyond their use by RarFile, since they are RAR file-related functions, wouldn't making then class methods at the very least be appropriate?

Why `@dataclass`?

The @dataclass decorator is adding many methods to your class that you do not seem to be using. At the same time it seems to me that a client of this class would most likely be always constructing an instance via the from_path class method and you might even want to make that the exclusive way of constructing instances. But with the class defined as is, there is no way of preventing someone from constructing via the __init__ method that @dataclass provides. Assuming you would prefer to discourage a client from directly using __init__, then this would be one way of doing it:

# No @dataclass decorator:
class RarFile:
 def __init__(self):
 raise RuntimeError('Construct this class using class method from_path')
 @classmethod
 def from_path(
 cls, archive_path: pathlib.Path, password: str | None = None
 ) -> typing.Self:
 ...
 #return cls(version, archive_path, files, password)
 instance = cls.__new__(cls)
 instance.version = version
 instance.path = archive_path
 instance.files = files
 instance.password = password
 return instance

And, of course, as suggested by the OP, if we are no longer using the @dataclass decorator we can incorporate the logic of from_path class method into an __init__ constructor and get rid of from_path altogether:

class RarFile:
 def __init__(self, archive_path: pathlib.Path, password: str | None = None) -> typing.Self:
 """Create a RarFile object by reading information from a RAR archive given its path."""
 infos = self.list_rar(archive_path, password) # Now an instance method
 ...
 self.version = version
 self.path = archive_path
 self.files = files
 self.password = password

Question 7

yes, get_crc32_slow, list_rar should be put into the RarFile class, don't make much sense outside of it. Regarding the from_path ... it could be an alternative put it all in __init__ (and use a normal class).

Question 8

Yes, that would be a reasonable alternative.

Question 9

Ok, after looking at similar code, I see that it's a bit unusual to put I/O operations in the constructor. There are use cases (e.g., unit tests) where you want to construct the object normally via __init__ and pass the stuff it needs. So it's common to have that and an option to construct it directly via a class method that performs I/O.

Reinderien Reinderien 71k5 gold badges76 silver badges256 bronze badges · Accepted Answer · 2025-01-30 01:58:50Z

When you write

"C:\Program Files7円-Zip7円z.exe"

you should use the r string literal prefix to make it clear to the interpreter that you didn't intend to write escape sequences.

When you define a @dataclass, consider settings slots=True for performance and robustness reasons.

This:

f" {str(file.path)

does not need to str(); string casts are implied in arguments to an interpolated string.

I discourage this pattern:

sub = subprocess.run(command_line, capture_output=True, check=False)
if sub.returncode != 0:
 logger.error(
 f"Command line {' '.join(map(str, command_line))} returned {sub.returncode}"
 )
 return None

You should check=True within a try. You have a logger (good!) so you can pass exc_info=True during your except. That means you could convert to check_output. You should also pass text=True and not bother about the encoding stuff.

For logger calls like this:

logger.debug(f"Listing {archive}, using password {password}")

Don't do string interpolation. Use the logger's own variadic-argument percent-style, as in

logger.debug('Listing %s, using password %s', archive, password)

AFAIK using a property for size isn't possible. The paths in the FileEntry aren't necessarily paths of real files, but only the entries of the RarFile (where the size was extracted). So I wonder if it is more correct to use PurePath here, since Path is supposed to be for concrete paths.

Stack Exchange Network

Getting information about a RAR file's content

3 Answers 3

`str`

Naming

Encapsulation

Why `@dataclass`?

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Linked

Hot Network Questions

Getting information about a RAR file's content

3 Answers 3

__str__

Naming

Encapsulation

Why @dataclass?

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Linked

Related

Hot Network Questions

`str`

Why `@dataclass`?