The structure of a class file consists of a single structure (presented here using pseudostructures written in a C-like structure notation):
ClassFile {
u4 magic;
u2 minor_version;
u2 major_version;
u2 constant_pool_count;
cp_info constant_pool[constant_pool_count-1];
u2 access_flags;
u2 this_class;
u2 super_class;
u2 interfaces_count;
u2 interfaces[interfaces_count];
u2 fields_count;
field_info fields[fields_count];
u2 methods_count;
method_info methods[methods_count];
u2 attributes_count;
attribute_info attributes[attributes_count];
}
The script parses the class file into a dictionary, except for the attributes, but does not verify that the file is valid/correct. (I did not deem that a good use of my time)
For a class file generated for this simple program:
public class Main {
public static void main(String[] args) {
System.out.println("Hello, World!");
}
}
the script produces:
{'access_flags': ['ACC_FINAL',
'ACC_INTERFACE',
'ACC_ABSTRACT',
'ACC_SYNTHETIC',
'ACC_ANNOTATION',
'ACC_ENUM'],
'attributes': [{'attribute_length': 2,
'attribute_name_index': 13,
'info': b'\x00\x0e'}],
'attributes_count': 1,
'constant_pool': [{'class_index': 6, 'name_and_type_index': 15, 'tag': 10},
{'class_index': 16, 'name_and_type_index': 17, 'tag': 9},
{'name_index': 18, 'tag': 8},
{'class_index': 19, 'name_and_type_index': 20, 'tag': 10},
{'name_index': 21, 'tag': 7},
{'name_index': 22, 'tag': 7},
{'bytes': b'<init>', 'length': 6, 'tag': 1},
{'bytes': b'()V', 'length': 3, 'tag': 1},
{'bytes': b'Code', 'length': 4, 'tag': 1},
{'bytes': b'LineNumberTable', 'length': 15, 'tag': 1},
{'bytes': b'main', 'length': 4, 'tag': 1},
{'bytes': b'([Ljava/lang/String;)V', 'length': 22, 'tag': 1},
{'bytes': b'SourceFile', 'length': 10, 'tag': 1},
{'bytes': b'Main.java', 'length': 9, 'tag': 1},
{'descriptor_index': 8, 'name_index': 7, 'tag': 12},
{'name_index': 23, 'tag': 7},
{'descriptor_index': 25, 'name_index': 24, 'tag': 12},
{'bytes': b'Hello, World!', 'length': 13, 'tag': 1},
{'name_index': 26, 'tag': 7},
{'descriptor_index': 28, 'name_index': 27, 'tag': 12},
{'bytes': b'Main', 'length': 4, 'tag': 1},
{'bytes': b'java/lang/Object', 'length': 16, 'tag': 1},
{'bytes': b'java/lang/System', 'length': 16, 'tag': 1},
{'bytes': b'out', 'length': 3, 'tag': 1},
{'bytes': b'Ljava/io/PrintStream;', 'length': 21, 'tag': 1},
{'bytes': b'java/io/PrintStream', 'length': 19, 'tag': 1},
{'bytes': b'println', 'length': 7, 'tag': 1},
{'bytes': b'(Ljava/lang/String;)V', 'length': 21, 'tag': 1}],
'constant_pool_count': 29,
'fields': [],
'fields_count': 0,
'interfaces': [],
'interfaces_count': 0,
'magic': '0XCAFEBABE',
'major': 55,
'methods': [{'access_flags': ['ACC_PRIVATE',
'ACC_PROTECTED',
'ACC_STATIC',
'ACC_FINAL',
'ACC_SYNCHRONIZED',
'ACC_BRIDGE',
'ACC_VARARGS',
'ACC_NATIVE',
'ACC_ABSTRACT',
'ACC_STRICT',
'ACC_SYNTHETIC'],
'attributes': [{'attribute_length': 29,
'attribute_name_index': 9,
'info': b'\x00\x01\x00\x01\x00\x00\x00\x05'
b'*\xb7\x00\x01\xb1\x00\x00\x00'
b'\x01\x00\n\x00\x00\x00\x06\x00'
b'\x01\x00\x00\x00\x01'}],
'attributes_count': 1,
'descriptor_index': 8,
'name_index': 7},
{'access_flags': ['ACC_PRIVATE',
'ACC_PROTECTED',
'ACC_FINAL',
'ACC_SYNCHRONIZED',
'ACC_BRIDGE',
'ACC_VARARGS',
'ACC_NATIVE',
'ACC_ABSTRACT',
'ACC_STRICT',
'ACC_SYNTHETIC'],
'attributes': [{'attribute_length': 37,
'attribute_name_index': 9,
'info': b'\x00\x02\x00\x01\x00\x00\x00\t'
b'\xb2\x00\x02\x12\x03\xb6\x00\x04'
b'\xb1\x00\x00\x00\x01\x00\n\x00'
b'\x00\x00\n\x00\x02\x00\x00\x00'
b'\x03\x00\x08\x00\x04'}],
'attributes_count': 1,
'descriptor_index': 12,
'name_index': 11}],
'methods_count': 2,
'minor': 0,
'super_class': 6,
'this_class': 5}
Code:
#!/usr/bin/env python3
from enum import Enum
from io import BytesIO
from pathlib import Path
from pprint import pprint
import typer
# fmt: off
# This got rather duplicative.
class Constants(Enum):
CONSTANT_Class = 7
CONSTANT_Fieldref = 9
CONSTANT_Methodref = 10
CONSTANT_InterfaceMethodref = 11
CONSTANT_String = 8
CONSTANT_Integer = 3
CONSTANT_Float = 4
CONSTANT_Long = 5
CONSTANT_Double = 6
CONSTANT_NameAndType = 12
CONSTANT_Utf8 = 1
CONSTANT_MethodHandle = 15
CONSTANT_MethodType = 16
CONSTANT_InvokeDynamic = 18
ACCESS_FLAGS = {
"class": [
("ACC_PUBLIC" ,0x0001),
("ACC_FINAL" ,0x0010),
("ACC_SUPER" ,0x0020),
("ACC_INTERFACE" ,0x0200),
("ACC_ABSTRACT" ,0x0400),
("ACC_SYNTHETIC" ,0x1000),
("ACC_ANNOTATION" ,0x2000),
("ACC_ENUM" ,0x4000),
],
"field": [
("ACC_PUBLIC" ,0x0001),
("ACC_PRIVATE" ,0x0002),
("ACC_PROTECTED" ,0x0004),
("ACC_STATIC" ,0x0008),
("ACC_FINAL" ,0x0010),
("ACC_VOLATILE" ,0x0040),
("ACC_TRANSIENT" ,0x0080),
("ACC_SYNTHETIC" ,0x1000),
("ACC_ENUM" ,0x4000),
],
"method": [
("ACC_PUBLIC" ,0x0001),
("ACC_PRIVATE" ,0x0002),
("ACC_PROTECTED" ,0x0004),
("ACC_STATIC" ,0x0008),
("ACC_FINAL" ,0x0010),
("ACC_SYNCHRONIZED" ,0x0020),
("ACC_BRIDGE" ,0x0040),
("ACC_VARARGS" ,0x0080),
("ACC_NATIVE" ,0x0100),
("ACC_ABSTRACT" ,0x0400),
("ACC_STRICT" ,0x0800),
("ACC_SYNTHETIC" ,0x1000),
],
}
# fmt: on
def parse_ux(file: BytesIO, length: int) -> int:
return int.from_bytes(file.read(length), "big")
def parse_u1(file: BytesIO) -> int:
return parse_ux(file, 1)
def parse_u2(file: BytesIO) -> int:
return parse_ux(file, 2)
def parse_u4(file: BytesIO) -> int:
return parse_ux(file, 4)
def parse_constant_pool(f: BytesIO, pool_size: int) -> int:
constant_pool = []
# We could map each constant tag to its corresponding processing logic.
# Would that be better? This looks horrendous.
for _ in range(pool_size):
cp_info = {}
tag = parse_u1(f)
constant = Constants(tag)
if constant in (
Constants.CONSTANT_Methodref,
Constants.CONSTANT_InterfaceMethodref,
Constants.CONSTANT_Fieldref,
):
cp_info["tag"] = constant.value
cp_info["class_index"] = parse_u2(f)
cp_info["name_and_type_index"] = parse_u2(f)
elif constant in (Constants.CONSTANT_Class, Constants.CONSTANT_String):
cp_info["tag"] = constant.value
cp_info["name_index"] = parse_u2(f)
elif constant == Constants.CONSTANT_Utf8:
cp_info["tag"] = constant.value
cp_info["length"] = parse_u2(f)
cp_info["bytes"] = f.read(cp_info["length"])
elif constant == Constants.CONSTANT_NameAndType:
cp_info["tag"] = constant.value
cp_info["name_index"] = parse_u2(f)
cp_info["descriptor_index"] = parse_u2(f)
elif constant in (Constants.CONSTANT_Integer, Constants.CONSTANT_Float):
cp_info["tag"] = constant.value
cp_info["bytes"] = f.read(4)
elif constant in (Constants.CONSTANT_Long, Constants.CONSTANT_Double):
cp_info["tag"] = constant.value
cp_info["high_bytes"] = f.read(4)
cp_info["low_bytes"] = f.read(4)
elif constant == Constants.CONSTANT_MethodHandle:
cp_info["tag"] = constant.value
cp_info["reference_kind"] = parse_u1(f)
cp_info["reference_index"] = parse_u2(f)
elif constant == Constants.CONSTANT_MethodType:
cp_info["tag"] = constant.value
cp_info["descriptor_index"] = parse_u2(f)
elif constant == Constants.CONSTANT_InvokeDynamic:
cp_info["tag"] = constant.value
cp_info["bootstrap_method_attr_index"] = parse_u2(f)
cp_info["name_and_type_index"] = parse_u2(f)
else:
assert False, f"Unexpected tag encountered {tag = }"
constant_pool.append(cp_info)
return constant_pool
def parse_access_flags(val: int, flags: [(str, int)]) -> list[str]:
return [name for (name, mask) in flags if not (val & mask)]
def parse_attributes(f: BytesIO, attributes_count: int) -> list:
attributes = []
for _ in range(attributes_count):
attribute_info = {}
attribute_info["attribute_name_index"] = parse_u2(f)
attribute_info["attribute_length"] = parse_u4(f)
attribute_info["info"] = f.read(attribute_info["attribute_length"])
attributes.append(attribute_info)
return attributes
def parse_methods(f: BytesIO, methods_count: int) -> list:
methods = []
for _ in range(methods_count):
method_info = {}
method_info["access_flags"] = parse_access_flags(
parse_u2(f), ACCESS_FLAGS["method"]
)
method_info["name_index"] = parse_u2(f)
method_info["descriptor_index"] = parse_u2(f)
method_info["attributes_count"] = parse_u2(f)
method_info["attributes"] = parse_attributes(f, method_info["attributes_count"])
methods.append(method_info)
return methods
def parse_fields(f: BytesIO, fields_count: int) -> dict:
fields = []
for _ in range(fields_count):
field_info = {}
field_info["access_flags"] = parse_access_flags(
parse_u2(f), ACCESS_FLAGS["field"]
)
field_info["name_index"] = parse_u2(f)
field_info["descriptor_index"] = parse_u2(f)
field_info["attributes_count"] = parse_u2(f)
field_info["attributes"] = parse_attributes(f, field_info["attributes_count"])
fields.append(field_info)
return fields
def parse_interfaces(f: BytesIO, interfaces_count: int) -> dict:
interfaces = []
for _ in range(interfaces_count):
parse_u1(f) # Discard tag
class_info = {"tag": "CONSTANT_Class", "name_index": parse_u2()}
interfaces.append(class_info)
return interfaces
def parse_class_file(f: BytesIO) -> dict:
class_file = {}
class_file["magic"] = str(hex(parse_u4(f))).upper()
class_file["minor"] = parse_u2(f)
class_file["major"] = parse_u2(f)
class_file["constant_pool_count"] = parse_u2(f)
class_file["constant_pool"] = parse_constant_pool(
f, class_file["constant_pool_count"] - 1
)
class_file["access_flags"] = parse_access_flags(parse_u2(f), ACCESS_FLAGS["class"])
class_file["this_class"] = parse_u2(f)
class_file["super_class"] = parse_u2(f)
class_file["interfaces_count"] = parse_u2(f)
class_file["interfaces"] = parse_interfaces(f, class_file["interfaces_count"])
class_file["fields_count"] = parse_u2(f)
class_file["fields"] = parse_fields(f, class_file["fields_count"])
class_file["methods_count"] = parse_u2(f)
class_file["methods"] = parse_methods(f, class_file["methods_count"])
class_file["attributes_count"] = parse_u2(f)
class_file["attributes"] = parse_attributes(f, class_file["attributes_count"])
return class_file
def main(file_path: Path) -> None:
with open(file_path, mode="rb") as f:
class_file = parse_class_file(BytesIO(f.read()))
pprint(class_file)
if __name__ == "__main__":
typer.run(main)
Review Request:
Bugs, general coding comments, style, idiomatic code, et cetera.
PS: This was done as a recreational activity.
-
\$\begingroup\$ It's fine that it was a recreational activity; but is this the end purpose? If not, how is the parsed struct then used? \$\endgroup\$Reinderien– Reinderien2024年02月23日 12:54:34 +00:00Commented Feb 23, 2024 at 12:54
-
\$\begingroup\$ Well, I might someday parse the bytecode too to run a small hello world program, but for now, this is the finished program, yes. \$\endgroup\$Madagascar– Madagascar2024年02月23日 12:56:45 +00:00Commented Feb 23, 2024 at 12:56
-
\$\begingroup\$ "In real life" you definitely don't want to parse byte code to run it; you need to call into an FFI. There are many options. \$\endgroup\$Reinderien– Reinderien2024年02月23日 13:00:17 +00:00Commented Feb 23, 2024 at 13:00
1 Answer 1
Your reference is extremely out-of-date; refer to version 21. Luckily the JVM hasn't changed much.
Typer seems like overkill for a program that unconditionally accepts one command-line argument. I scarcely consider that justification for bringing in a third-party library.
Your Constants
shouldn't be an Enum
; it should be an IntEnum
. Your ACCESS_FLAGS
should not be a dict of lists; it should be split out into separate IntFlag
s.
When you print the constant tag, don't print the number; print the symbol. repr
(!r
) will do this.
I consider int.from_bytes
and the variable-length method used in parse_ux
to be less explicit than the other two options I'll be demonstrating, which are struct
unpacking and ctypes
unpacking. Your parse_fields
and similar methods should be entirely replaced with big-endian structure definitions.
Don't use dictionaries for internal program data; they aren't well-typed.
Your script will not be very useful until you resolve the constant indices to their respective structures. For instance, your output 'attribute_name_index': 9
would be replaced with a reference to the corresponding constant string.
Replace open(file_path, mode="rb")
with file_path.open()
.
It's actually a pretty reasonable idea to in-memory buffer the file content before deserialising it, and may have performance advantages; but for simplicity I do not include this in my demonstration.
Suggested
The following is a little long-winded, but demonstrates some of the concepts I've talked about above. It has nearly mypy-compliant types, save for the functional enums that mypy does not support.
#!/usr/bin/env python3
import ctypes
import struct
import sys
from dataclasses import dataclass
from enum import IntEnum, IntFlag
from functools import partial
from io import BufferedIOBase
from itertools import chain
from pathlib import Path
from typing import Callable, ClassVar, Iterator, NamedTuple, Type, TypeVar
# Spec from
# https://docs.oracle.com/javase/specs/jvms/se21/html/jvms-4.html
# Since we don't require strict validation, this captures all flags that don't
# have multiple definitions.
ACCESS_SHARED = {
'PUBLIC' : 0x0001,
'PRIVATE' : 0x0002,
'PROTECTED' : 0x0004,
'STATIC' : 0x0008,
'FINAL' : 0x0010,
'NATIVE' : 0x0100,
'INTERFACE' : 0x0200,
'ABSTRACT' : 0x0400,
'STRICT' : 0x0800,
'SYNTHETIC' : 0x1000,
'ANNOTATION': 0x2000,
'ENUM' : 0x4000,
}
# This functional enum form is not mypy-compatible.
CommonAccess = IntFlag('CommonAccess', ACCESS_SHARED)
ClassAccess = IntFlag('ClassAccess', {
**ACCESS_SHARED,
'SUPER': 0x0020,
'MODULE': 0x8000,
})
MethodAccess = IntFlag('MethodAccess', {
**ACCESS_SHARED,
'SYNCHRONIZED' : 0x0020,
'BRIDGE' : 0x0040,
'VARARGS' : 0x0080,
})
ParameterAccess = IntFlag('ParameterAccess', {
**ACCESS_SHARED,
'MANDATED' : 0x8000,
})
ModuleAccess = IntFlag('ModuleAccess', {
**ACCESS_SHARED,
'OPEN' : 0x0020,
'MANDATED' : 0x8000,
})
ModuleRequiresAccess = IntFlag('ModuleRequiresAccess', {
**ACCESS_SHARED,
'TRANSITIVE' : 0x0020,
'STATIC_PHASE' : 0x0040,
'MANDATED' : 0x8000,
})
FieldAccess = IntFlag('FieldAccess', {
**ACCESS_SHARED,
'VOLATILE' : 0x0040,
'TRANSIENT' : 0x0080,
})
class ConstantTag(IntEnum):
UTF8 = 1
INTEGER = 3
FLOAT = 4
LONG = 5
DOUBLE = 6
CLASS = 7
STRING = 8
FIELD_REF = 9
METHOD_REF = 10
INTERFACE_METHOD_REF = 11
NAME_AND_TYPE = 12
METHOD_HANDLE = 15
METHOD_TYPE = 16
DYNAMIC = 17
INVOKE_DYNAMIC = 18
MODULE = 19
PACKAGE = 20
class ReferenceKind(IntEnum):
getField = 1
getStatic = 2
putField = 3
putStatic = 4
invokeVirtual = 5
invokeStatic = 6
invokeSpecial = 7
newInvokeSpecial = 8
invokeInterface = 9
class Version(ctypes.BigEndianStructure):
_pack_ = 1
_fields_ = (
('magic', ctypes.c_uint32),
('minor_version', ctypes.c_uint16),
('major_version', ctypes.c_uint16),
)
__slots__ = [k for k, t in _fields_]
class Constant:
CHILDREN: ClassVar[tuple[str, ...]]
class ClassConstant(Constant, ctypes.BigEndianStructure):
_pack_ = 1
_fields_ = (
('name_index', ctypes.c_uint16),
)
CHILDREN = 'name_index',
__slots__ = ('name_index', 'name_constant')
def __str__(self) -> str:
return str(self.name_constant)
ModuleConstant = ClassConstant
PackageConstant = ClassConstant
class DoubleConstant(Constant, ctypes.BigEndianStructure):
_pack_ = 1
_fields_ = (
# Don't represent as high bytes and low bytes in the spec;
# directly unpack to value
('value', ctypes.c_double),
)
CHILDREN = ()
__slots__ = 'value',
def __str__(self) -> str:
return str(self.value)
class DynamicConstant(Constant, ctypes.BigEndianStructure):
_pack_ = 1
_fields_ = (
('bootstrap_method_attr_index', ctypes.c_uint16),
('name_and_type_index', ctypes.c_uint16),
)
CHILDREN = ('bootstrap_method_attr_index', 'name_and_type_index')
__slots__ = (
'bootstrap_method_attr_index', 'bootstrap_method_attr_constant',
'name_and_type_index', 'name_and_type_constant',
)
def __str__(self) -> str:
return f'{self.name_and_type_constant} -> {self.bootstrap_method_attr_constant}'
class FloatConstant(Constant, ctypes.BigEndianStructure):
_pack_ = 1
_fields_ = (
('value', ctypes.c_float),
)
CHILDREN = ()
__slots__ = 'value',
def __str__(self) -> str:
return str(self.value)
class IntegerConstant(Constant, ctypes.BigEndianStructure):
_pack_ = 1
_fields_ = (
('value', ctypes.c_int32),
)
CHILDREN = ()
__slots__ = 'value',
def __str__(self) -> str:
return str(self.value)
class LongConstant(Constant, ctypes.BigEndianStructure):
_pack_ = 1
_fields_ = (
('value', ctypes.c_int64),
)
CHILDREN = ()
__slots__ = 'value',
def __str__(self) -> str:
return str(self.value)
InvokeDynamicConstant = DynamicConstant
class MethodHandleConstant(Constant, ctypes.BigEndianStructure):
_pack_ = 1
_fields_ = (
('reference_kind', ctypes.c_uint8),
('reference_index', ctypes.c_uint16),
)
CHILDREN = 'reference_index',
__slots__ = (
'reference_kind',
'reference_index', 'reference_constant',
)
@property
def kind(self) -> ReferenceKind:
return ReferenceKind(self.reference_kind)
def __str__(self) -> str:
return f'{self.kind.name} {self.reference_constant}'
class MethodRefConstant(Constant, ctypes.BigEndianStructure):
_pack_ = 1
_fields_ = (
('class_index', ctypes.c_uint16),
('name_and_type_index', ctypes.c_uint16),
)
CHILDREN = ('class_index', 'name_and_type_index')
__slots__ = (
'class_index', 'class_constant',
'name_and_type_index', 'name_and_type_constant',
)
def __str__(self) -> str:
return str(self.name_and_type_constant)
class MethodTypeConstant(Constant, ctypes.BigEndianStructure):
_pack_ = 1
_fields_ = (
('descriptor_index', ctypes.c_uint16),
)
CHILDREN = 'descriptor_index',
__slots__ = ('descriptor_index', 'descriptor_constant')
def __str__(self) -> str:
return str(self.descriptor_constant)
FieldRefConstant = MethodRefConstant
InterfaceMethodConstant = MethodRefConstant
class NameAndTypeConstant(Constant, ctypes.BigEndianStructure):
_pack_ = 1
_fields_ = (
('name_index', ctypes.c_uint16),
('descriptor_index', ctypes.c_uint16),
)
CHILDREN = ('name_index', 'descriptor_index')
__slots__ = (
'name_index', 'name_constant',
'descriptor_index', 'descriptor_constant',
)
def __str__(self) -> str:
return f'{self.name_constant} "{self.descriptor_constant}"'
class StringConstant(Constant, ctypes.BigEndianStructure):
_pack_ = 1
_fields_ = (
('string_index', ctypes.c_uint16),
)
CHILDREN = 'string_index',
__slots__ = ('string_index', 'string_constant')
def __str__(self) -> str:
return str(self.string_constant)
class AttributeInfo(Constant, ctypes.BigEndianStructure):
_pack_ = 1
_fields_ = (
('attribute_name_index', ctypes.c_uint16),
('attribute_length', ctypes.c_uint32),
)
data: bytes
CHILDREN = 'attribute_name_index',
__slots__ = (
'attribute_name_index', 'attribute_name_constant',
'attribute_length', 'data',
)
def __str__(self) -> str:
return f'{self.attribute_name_constant}'
class FieldInfo(Constant, ctypes.BigEndianStructure):
_pack_ = 1
_fields_ = (
('access_flags', ctypes.c_uint16),
('name_index', ctypes.c_uint16),
('descriptor_index', ctypes.c_uint16),
('attributes_count', ctypes.c_uint16),
)
CHILDREN = ('name_index', 'descriptor_index')
attributes: tuple[AttributeInfo, ...]
__slots__ = (
'name_index', 'name_constant',
'descriptor_index', 'descriptor_constant',
'access_flags', 'attributes_count',
'attributes',
)
@property
def access(self) -> FieldAccess:
return FieldAccess(self.access_flags)
def __str__(self) -> str:
s = f'{self.access!r} {self.name_constant} "{self.descriptor_constant}"'
attrs = ', '.join(str(a) for a in self.attributes)
if attrs:
s += ' @ ' + attrs
return s
class MethodInfo(Constant, ctypes.BigEndianStructure):
_pack_ = 1
_fields_ = (
('access_flags', ctypes.c_uint16),
('name_index', ctypes.c_uint16),
('descriptor_index', ctypes.c_uint16),
('attributes_count', ctypes.c_uint16),
)
CHILDREN = ('name_index', 'descriptor_index')
attributes: tuple[AttributeInfo, ...]
__slots__ = (
'name_index', 'name_constant',
'descriptor_index', 'descriptor_constant',
'access_flags', 'attributes_count',
'attributes',
)
@property
def access(self) -> MethodAccess:
return MethodAccess(self.access_flags)
def __str__(self) -> str:
s = f'{self.access!r} {self.name_constant} "{self.descriptor_constant}"'
attrs = ', '.join(str(a) for a in self.attributes)
if attrs:
s += ' @ ' + attrs
return s
@dataclass(frozen=True, slots=True)
class UTF8Constant(Constant):
length: int
bytes_: bytes
CHILDREN = ()
@classmethod
def read(cls, f: BufferedIOBase) -> 'UTF8Constant':
length = read_short(f)
bytes_ = f.read(length)
return cls(length=length, bytes_=bytes_)
def __str__(self) -> str:
return self.bytes_.decode(encoding='utf8')
StructT = TypeVar('StructT', bound=ctypes.BigEndianStructure)
def read_struct(f: BufferedIOBase, type_: Type[StructT]) -> StructT:
value = type_()
f.readinto(value)
return value
def read_short(f: BufferedIOBase) -> int:
fmt = '>H'
buffer = f.read(struct.calcsize(fmt))
value, = struct.unpack(fmt, buffer)
return value
def read_indices(f: BufferedIOBase, n: int) -> tuple[int, ...]:
fmt = f'>{n}H'
buffer = f.read(struct.calcsize(fmt))
return struct.unpack(fmt, buffer)
def bind_read(type_: Type[StructT]) -> Callable[[BufferedIOBase], StructT]:
return partial(read_struct, type_=type_)
CONSTANT_READERS = {
ConstantTag.CLASS: bind_read(ClassConstant),
ConstantTag.DOUBLE: bind_read(DoubleConstant),
ConstantTag.DYNAMIC: bind_read(DynamicConstant),
ConstantTag.FIELD_REF: bind_read(FieldRefConstant),
ConstantTag.FLOAT: bind_read(FloatConstant),
ConstantTag.INTEGER: bind_read(IntegerConstant),
ConstantTag.INTERFACE_METHOD_REF: bind_read(InterfaceMethodConstant),
ConstantTag.INVOKE_DYNAMIC: bind_read(InvokeDynamicConstant),
ConstantTag.LONG: bind_read(LongConstant),
ConstantTag.METHOD_REF: bind_read(MethodRefConstant),
ConstantTag.METHOD_HANDLE: bind_read(MethodHandleConstant),
ConstantTag.METHOD_TYPE: bind_read(MethodTypeConstant),
ConstantTag.MODULE: bind_read(ModuleConstant),
ConstantTag.NAME_AND_TYPE: bind_read(NameAndTypeConstant),
ConstantTag.PACKAGE: bind_read(PackageConstant),
ConstantTag.STRING: bind_read(StringConstant),
ConstantTag.UTF8: UTF8Constant.read,
}
def generate_constants(f: BufferedIOBase, n: int) -> Iterator[Constant]:
for _ in range(n):
tag_value, = f.read(1)
tag = ConstantTag(tag_value)
yield CONSTANT_READERS[tag](f)
def generate_attrs(f: BufferedIOBase, n: int) -> Iterator[AttributeInfo]:
for _ in range(n):
attr = read_struct(f, AttributeInfo)
attr.data = f.read(attr.attribute_length)
yield attr
class Class(NamedTuple):
major_version: int
minor_version: int
access_flags: ClassAccess
constants: tuple[Constant, ...]
this_class: Constant
super_class: Constant
interfaces: tuple[Constant, ...]
fields: tuple[FieldInfo, ...]
methods: tuple[MethodInfo, ...]
attributes: tuple[AttributeInfo, ...]
@classmethod
def deserialise(cls, f: BufferedIOBase) -> 'Class':
version = read_struct(f=f, type_=Version)
constant_pool_count = read_short(f)
constant_pool = tuple(generate_constants(f, n=constant_pool_count - 1))
access_flags = ClassAccess(read_short(f))
this_class = read_short(f)
super_class = read_short(f)
interfaces_count = read_short(f)
interfaces = read_indices(f=f, n=interfaces_count)
fields_count = read_short(f)
fields = [
(
field := read_struct(f, FieldInfo),
tuple(generate_attrs(f, field.attributes_count)),
)
for _ in range(fields_count)
]
methods_count = read_short(f)
methods = [
(
method := read_struct(f, MethodInfo),
tuple(generate_attrs(f, method.attributes_count)),
)
for _ in range(methods_count)
]
attributes_count = read_short(f)
attributes = tuple(generate_attrs(f, attributes_count))
trailing = len(f.read())
if trailing != 0:
raise ValueError(f'{trailing} trailing bytes after deserialise')
return cls._traverse(
version=version, constants=constant_pool, access_flags=access_flags,
this_idx=this_class, interfaces=interfaces,
super_idx=super_class, attributes=attributes,
fields=fields, methods=methods,
)
@classmethod
def _traverse(
cls,
version: Version,
constants: tuple[Constant, ...],
access_flags: ClassAccess,
this_idx: int,
super_idx: int,
interfaces: tuple[int, ...],
fields: list[
tuple[
FieldInfo,
tuple[AttributeInfo, ...],
]
],
methods: list[
tuple[
MethodInfo,
tuple[AttributeInfo, ...],
]
],
attributes: tuple[AttributeInfo, ...],
) -> 'Class':
field_constants = [field[0] for field in fields]
method_constants = [method[0] for method in methods]
all_nodes: tuple[Constant, ...] = (
*constants,
*field_constants,
*method_constants,
*chain.from_iterable(field[1] for field in fields),
*chain.from_iterable(method[1] for method in methods),
*attributes,
)
for constant in all_nodes:
for child_name in constant.CHILDREN:
varname = child_name.removesuffix('_index') + '_constant'
child = constants[getattr(constant, child_name)]
setattr(constant, varname, child)
for field, attrs in fields:
field.attributes = attrs
for method, attrs in methods:
method.attributes = attrs
return cls(
major_version=version.major_version,
minor_version=version.minor_version,
access_flags=access_flags,
constants=constants,
this_class=constants[this_idx],
super_class=constants[super_idx],
interfaces=tuple(
constants[idx] for idx in interfaces
),
fields=tuple(field_constants),
methods=tuple(method_constants),
attributes=attributes,
)
def dump(self, verbose: bool = False) -> Iterator[str]:
yield f'Version {self.major_version}.{self.minor_version}'
yield f'Class: {self.this_class}'
yield f'Super: {self.super_class}'
yield f'Access flags: {self.access_flags!r}'
yield 'Attributes:'
for attr in self.attributes:
yield f' {attr}'
yield 'Fields:'
for field in self.fields:
yield f' {field}'
yield 'Methods:'
for method in self.methods:
yield f' {method}'
yield 'Interfaces:'
for iface in self.interfaces:
yield f' {iface}'
if verbose:
yield 'Constant pool:'
for constant in self.constants:
yield f' {constant}'
def main() -> None:
_, file_path = sys.argv
with Path(file_path).open(mode='rb') as f:
class_ = Class.deserialise(f)
print('\n'.join(class_.dump(verbose=True)))
if __name__ == '__main__':
main()
Output (simple example)
Version 65.0
Class: I "comparator"
Super: ()V "TopByOrder"
Access flags: <ClassAccess.SUPER|PUBLIC: 33>
Attributes:
Ljava/util/Comparator<TE;>;
TopByOrder.java
InnerClasses
CountingComparator
Fields:
<FieldAccess.FINAL|PUBLIC: 17> I "comparator"
<FieldAccess.FINAL|PUBLIC: 17> Ljava/util/Comparator; "(ILjava/util/Comparator;)V"Ljava/util/Comparator<TE;>;
Methods:
<MethodAccess.PUBLIC: 1> ()V "java/util/Collection" @ LineNumberTable, Ljava/util/Comparator<TE;>;
<MethodAccess.PUBLIC: 1> (Ljava/util/Collection;)Ljava/util/PriorityQueue; "java/util/Collection" @ LineNumberTable, Ljava/util/Comparator<TE;>;
<MethodAccess.STATIC|PUBLIC: 9> ([Ljava/lang/String;)V "Ljava/lang/String;" @ LineNumberTable
Output (more complex example)
This one has Dynamics.
Version 65.0
Class: (Ljava/util/List;)Ljava/util/List; "(Ljava/util/List;)V"
Super: ()V "java/lang/Object"
Access flags: <ClassAccess.SUPER|FINAL|PUBLIC: 49>
Attributes:
MultipleGroupPermuterDemo.java
<ReferenceKind.invokeVirtual: 5> (Ljava/lang/Integer;)I "<ReferenceKind.invokeStatic: 6> (Ljava/lang/invoke/MethodHandles$Lookup;Ljava/lang/String;Ljava/lang/invoke/MethodType;Ljava/lang/String;[Ljava/lang/Object;)Ljava/lang/invoke/CallSite;"
Lookup
Fields:
Methods:
<MethodAccess.PUBLIC: 1> ()V "java/lang/Object" @ LineNumberTable
<MethodAccess.STATIC|PUBLIC: 9> ([Ljava/lang/String;)V "groupPermutation" @ LineNumberTable
<MethodAccess.STATIC|PRIVATE: 10> (Ljava/util/Map;)I "makeConcatWithConstants -> <init>" @ LineNumberTable, <T:Ljava/lang/Object;>(Ljava/util/Map<Ljava/util/List<Ljava/util/List<TT;>;>;Ljava/lang/Integer;>;)I
<MethodAccess.STATIC|PRIVATE: 10> (Ljava/util/List;)Ljava/lang/String; "java/io/PrintStream" @ LineNumberTable, <T:Ljava/lang/Object;>(Ljava/util/Map<Ljava/util/List<Ljava/util/List<TT;>;>;Ljava/lang/Integer;>;)I
<MethodAccess.STATIC|PRIVATE: 10> format "java/io/PrintStream" @ LineNumberTable, <T:Ljava/lang/Object;>(Ljava/util/Map<Ljava/util/List<Ljava/util/List<TT;>;>;Ljava/lang/Integer;>;)I
<MethodAccess.STATIC|PRIVATE: 10> (Ljava/util/List;)Ljava/util/List; "(Ljava/util/List;)V" @ LineNumberTable, <T:Ljava/lang/Object;>(Ljava/util/Map<Ljava/util/List<Ljava/util/List<TT;>;>;Ljava/lang/Integer;>;)I
<MethodAccess.STATIC|PRIVATE: 10> java/lang/System "computeGroupPermutations" @ LineNumberTable, <T:Ljava/lang/Object;>(Ljava/util/Map<Ljava/util/List<Ljava/util/List<TT;>;>;Ljava/lang/Integer;>;)I