3
\$\begingroup\$

The structure of a class file consists of a single structure (presented here using pseudostructures written in a C-like structure notation):

ClassFile {
 u4 magic;
 u2 minor_version;
 u2 major_version;
 u2 constant_pool_count;
 cp_info constant_pool[constant_pool_count-1];
 u2 access_flags;
 u2 this_class;
 u2 super_class;
 u2 interfaces_count;
 u2 interfaces[interfaces_count];
 u2 fields_count;
 field_info fields[fields_count];
 u2 methods_count;
 method_info methods[methods_count];
 u2 attributes_count;
 attribute_info attributes[attributes_count];
}

The script parses the class file into a dictionary, except for the attributes, but does not verify that the file is valid/correct. (I did not deem that a good use of my time)

For a class file generated for this simple program:

public class Main {
 public static void main(String[] args) {
 System.out.println("Hello, World!");
 }
}

the script produces:

{'access_flags': ['ACC_FINAL',
 'ACC_INTERFACE',
 'ACC_ABSTRACT',
 'ACC_SYNTHETIC',
 'ACC_ANNOTATION',
 'ACC_ENUM'],
 'attributes': [{'attribute_length': 2,
 'attribute_name_index': 13,
 'info': b'\x00\x0e'}],
 'attributes_count': 1,
 'constant_pool': [{'class_index': 6, 'name_and_type_index': 15, 'tag': 10},
 {'class_index': 16, 'name_and_type_index': 17, 'tag': 9},
 {'name_index': 18, 'tag': 8},
 {'class_index': 19, 'name_and_type_index': 20, 'tag': 10},
 {'name_index': 21, 'tag': 7},
 {'name_index': 22, 'tag': 7},
 {'bytes': b'<init>', 'length': 6, 'tag': 1},
 {'bytes': b'()V', 'length': 3, 'tag': 1},
 {'bytes': b'Code', 'length': 4, 'tag': 1},
 {'bytes': b'LineNumberTable', 'length': 15, 'tag': 1},
 {'bytes': b'main', 'length': 4, 'tag': 1},
 {'bytes': b'([Ljava/lang/String;)V', 'length': 22, 'tag': 1},
 {'bytes': b'SourceFile', 'length': 10, 'tag': 1},
 {'bytes': b'Main.java', 'length': 9, 'tag': 1},
 {'descriptor_index': 8, 'name_index': 7, 'tag': 12},
 {'name_index': 23, 'tag': 7},
 {'descriptor_index': 25, 'name_index': 24, 'tag': 12},
 {'bytes': b'Hello, World!', 'length': 13, 'tag': 1},
 {'name_index': 26, 'tag': 7},
 {'descriptor_index': 28, 'name_index': 27, 'tag': 12},
 {'bytes': b'Main', 'length': 4, 'tag': 1},
 {'bytes': b'java/lang/Object', 'length': 16, 'tag': 1},
 {'bytes': b'java/lang/System', 'length': 16, 'tag': 1},
 {'bytes': b'out', 'length': 3, 'tag': 1},
 {'bytes': b'Ljava/io/PrintStream;', 'length': 21, 'tag': 1},
 {'bytes': b'java/io/PrintStream', 'length': 19, 'tag': 1},
 {'bytes': b'println', 'length': 7, 'tag': 1},
 {'bytes': b'(Ljava/lang/String;)V', 'length': 21, 'tag': 1}],
 'constant_pool_count': 29,
 'fields': [],
 'fields_count': 0,
 'interfaces': [],
 'interfaces_count': 0,
 'magic': '0XCAFEBABE',
 'major': 55,
 'methods': [{'access_flags': ['ACC_PRIVATE',
 'ACC_PROTECTED',
 'ACC_STATIC',
 'ACC_FINAL',
 'ACC_SYNCHRONIZED',
 'ACC_BRIDGE',
 'ACC_VARARGS',
 'ACC_NATIVE',
 'ACC_ABSTRACT',
 'ACC_STRICT',
 'ACC_SYNTHETIC'],
 'attributes': [{'attribute_length': 29,
 'attribute_name_index': 9,
 'info': b'\x00\x01\x00\x01\x00\x00\x00\x05'
 b'*\xb7\x00\x01\xb1\x00\x00\x00'
 b'\x01\x00\n\x00\x00\x00\x06\x00'
 b'\x01\x00\x00\x00\x01'}],
 'attributes_count': 1,
 'descriptor_index': 8,
 'name_index': 7},
 {'access_flags': ['ACC_PRIVATE',
 'ACC_PROTECTED',
 'ACC_FINAL',
 'ACC_SYNCHRONIZED',
 'ACC_BRIDGE',
 'ACC_VARARGS',
 'ACC_NATIVE',
 'ACC_ABSTRACT',
 'ACC_STRICT',
 'ACC_SYNTHETIC'],
 'attributes': [{'attribute_length': 37,
 'attribute_name_index': 9,
 'info': b'\x00\x02\x00\x01\x00\x00\x00\t'
 b'\xb2\x00\x02\x12\x03\xb6\x00\x04'
 b'\xb1\x00\x00\x00\x01\x00\n\x00'
 b'\x00\x00\n\x00\x02\x00\x00\x00'
 b'\x03\x00\x08\x00\x04'}],
 'attributes_count': 1,
 'descriptor_index': 12,
 'name_index': 11}],
 'methods_count': 2,
 'minor': 0,
 'super_class': 6,
 'this_class': 5}

Code:

#!/usr/bin/env python3
from enum import Enum
from io import BytesIO
from pathlib import Path
from pprint import pprint
import typer
# fmt: off
# This got rather duplicative. 
class Constants(Enum):
 CONSTANT_Class = 7
 CONSTANT_Fieldref = 9
 CONSTANT_Methodref = 10
 CONSTANT_InterfaceMethodref = 11
 CONSTANT_String = 8
 CONSTANT_Integer = 3
 CONSTANT_Float = 4
 CONSTANT_Long = 5
 CONSTANT_Double = 6
 CONSTANT_NameAndType = 12
 CONSTANT_Utf8 = 1
 CONSTANT_MethodHandle = 15
 CONSTANT_MethodType = 16
 CONSTANT_InvokeDynamic = 18
ACCESS_FLAGS = {
 "class": [
 ("ACC_PUBLIC" ,0x0001),
 ("ACC_FINAL" ,0x0010),
 ("ACC_SUPER" ,0x0020),
 ("ACC_INTERFACE" ,0x0200),
 ("ACC_ABSTRACT" ,0x0400),
 ("ACC_SYNTHETIC" ,0x1000),
 ("ACC_ANNOTATION" ,0x2000),
 ("ACC_ENUM" ,0x4000),
 ],
 "field": [
 ("ACC_PUBLIC" ,0x0001),
 ("ACC_PRIVATE" ,0x0002),
 ("ACC_PROTECTED" ,0x0004),
 ("ACC_STATIC" ,0x0008),
 ("ACC_FINAL" ,0x0010),
 ("ACC_VOLATILE" ,0x0040),
 ("ACC_TRANSIENT" ,0x0080),
 ("ACC_SYNTHETIC" ,0x1000),
 ("ACC_ENUM" ,0x4000),
 ],
 "method": [
 ("ACC_PUBLIC" ,0x0001),
 ("ACC_PRIVATE" ,0x0002),
 ("ACC_PROTECTED" ,0x0004),
 ("ACC_STATIC" ,0x0008),
 ("ACC_FINAL" ,0x0010),
 ("ACC_SYNCHRONIZED" ,0x0020),
 ("ACC_BRIDGE" ,0x0040),
 ("ACC_VARARGS" ,0x0080),
 ("ACC_NATIVE" ,0x0100),
 ("ACC_ABSTRACT" ,0x0400),
 ("ACC_STRICT" ,0x0800),
 ("ACC_SYNTHETIC" ,0x1000),
 ],
}
# fmt: on
def parse_ux(file: BytesIO, length: int) -> int:
 return int.from_bytes(file.read(length), "big")
def parse_u1(file: BytesIO) -> int:
 return parse_ux(file, 1)
def parse_u2(file: BytesIO) -> int:
 return parse_ux(file, 2)
def parse_u4(file: BytesIO) -> int:
 return parse_ux(file, 4)
def parse_constant_pool(f: BytesIO, pool_size: int) -> int:
 constant_pool = []
 # We could map each constant tag to its corresponding processing logic.
 # Would that be better? This looks horrendous.
 for _ in range(pool_size):
 cp_info = {}
 tag = parse_u1(f)
 constant = Constants(tag)
 if constant in (
 Constants.CONSTANT_Methodref,
 Constants.CONSTANT_InterfaceMethodref,
 Constants.CONSTANT_Fieldref,
 ):
 cp_info["tag"] = constant.value
 cp_info["class_index"] = parse_u2(f)
 cp_info["name_and_type_index"] = parse_u2(f)
 elif constant in (Constants.CONSTANT_Class, Constants.CONSTANT_String):
 cp_info["tag"] = constant.value
 cp_info["name_index"] = parse_u2(f)
 elif constant == Constants.CONSTANT_Utf8:
 cp_info["tag"] = constant.value
 cp_info["length"] = parse_u2(f)
 cp_info["bytes"] = f.read(cp_info["length"])
 elif constant == Constants.CONSTANT_NameAndType:
 cp_info["tag"] = constant.value
 cp_info["name_index"] = parse_u2(f)
 cp_info["descriptor_index"] = parse_u2(f)
 elif constant in (Constants.CONSTANT_Integer, Constants.CONSTANT_Float):
 cp_info["tag"] = constant.value
 cp_info["bytes"] = f.read(4)
 elif constant in (Constants.CONSTANT_Long, Constants.CONSTANT_Double):
 cp_info["tag"] = constant.value
 cp_info["high_bytes"] = f.read(4)
 cp_info["low_bytes"] = f.read(4)
 elif constant == Constants.CONSTANT_MethodHandle:
 cp_info["tag"] = constant.value
 cp_info["reference_kind"] = parse_u1(f)
 cp_info["reference_index"] = parse_u2(f)
 elif constant == Constants.CONSTANT_MethodType:
 cp_info["tag"] = constant.value
 cp_info["descriptor_index"] = parse_u2(f)
 elif constant == Constants.CONSTANT_InvokeDynamic:
 cp_info["tag"] = constant.value
 cp_info["bootstrap_method_attr_index"] = parse_u2(f)
 cp_info["name_and_type_index"] = parse_u2(f)
 else:
 assert False, f"Unexpected tag encountered {tag = }"
 constant_pool.append(cp_info)
 return constant_pool
def parse_access_flags(val: int, flags: [(str, int)]) -> list[str]:
 return [name for (name, mask) in flags if not (val & mask)]
def parse_attributes(f: BytesIO, attributes_count: int) -> list:
 attributes = []
 for _ in range(attributes_count):
 attribute_info = {}
 attribute_info["attribute_name_index"] = parse_u2(f)
 attribute_info["attribute_length"] = parse_u4(f)
 attribute_info["info"] = f.read(attribute_info["attribute_length"])
 attributes.append(attribute_info)
 return attributes
def parse_methods(f: BytesIO, methods_count: int) -> list:
 methods = []
 for _ in range(methods_count):
 method_info = {}
 method_info["access_flags"] = parse_access_flags(
 parse_u2(f), ACCESS_FLAGS["method"]
 )
 method_info["name_index"] = parse_u2(f)
 method_info["descriptor_index"] = parse_u2(f)
 method_info["attributes_count"] = parse_u2(f)
 method_info["attributes"] = parse_attributes(f, method_info["attributes_count"])
 methods.append(method_info)
 return methods
def parse_fields(f: BytesIO, fields_count: int) -> dict:
 fields = []
 for _ in range(fields_count):
 field_info = {}
 field_info["access_flags"] = parse_access_flags(
 parse_u2(f), ACCESS_FLAGS["field"]
 )
 field_info["name_index"] = parse_u2(f)
 field_info["descriptor_index"] = parse_u2(f)
 field_info["attributes_count"] = parse_u2(f)
 field_info["attributes"] = parse_attributes(f, field_info["attributes_count"])
 fields.append(field_info)
 return fields
def parse_interfaces(f: BytesIO, interfaces_count: int) -> dict:
 interfaces = []
 for _ in range(interfaces_count):
 parse_u1(f) # Discard tag
 class_info = {"tag": "CONSTANT_Class", "name_index": parse_u2()}
 interfaces.append(class_info)
 return interfaces
def parse_class_file(f: BytesIO) -> dict:
 class_file = {}
 class_file["magic"] = str(hex(parse_u4(f))).upper()
 class_file["minor"] = parse_u2(f)
 class_file["major"] = parse_u2(f)
 class_file["constant_pool_count"] = parse_u2(f)
 class_file["constant_pool"] = parse_constant_pool(
 f, class_file["constant_pool_count"] - 1
 )
 class_file["access_flags"] = parse_access_flags(parse_u2(f), ACCESS_FLAGS["class"])
 class_file["this_class"] = parse_u2(f)
 class_file["super_class"] = parse_u2(f)
 class_file["interfaces_count"] = parse_u2(f)
 class_file["interfaces"] = parse_interfaces(f, class_file["interfaces_count"])
 class_file["fields_count"] = parse_u2(f)
 class_file["fields"] = parse_fields(f, class_file["fields_count"])
 class_file["methods_count"] = parse_u2(f)
 class_file["methods"] = parse_methods(f, class_file["methods_count"])
 class_file["attributes_count"] = parse_u2(f)
 class_file["attributes"] = parse_attributes(f, class_file["attributes_count"])
 return class_file
def main(file_path: Path) -> None:
 with open(file_path, mode="rb") as f:
 class_file = parse_class_file(BytesIO(f.read()))
 pprint(class_file)
if __name__ == "__main__":
 typer.run(main)

Review Request:

Bugs, general coding comments, style, idiomatic code, et cetera.

PS: This was done as a recreational activity.

Reinderien
70.9k5 gold badges76 silver badges256 bronze badges
asked Feb 23, 2024 at 8:37
\$\endgroup\$
3
  • \$\begingroup\$ It's fine that it was a recreational activity; but is this the end purpose? If not, how is the parsed struct then used? \$\endgroup\$ Commented Feb 23, 2024 at 12:54
  • \$\begingroup\$ Well, I might someday parse the bytecode too to run a small hello world program, but for now, this is the finished program, yes. \$\endgroup\$ Commented Feb 23, 2024 at 12:56
  • \$\begingroup\$ "In real life" you definitely don't want to parse byte code to run it; you need to call into an FFI. There are many options. \$\endgroup\$ Commented Feb 23, 2024 at 13:00

1 Answer 1

3
\$\begingroup\$

Your reference is extremely out-of-date; refer to version 21. Luckily the JVM hasn't changed much.

Typer seems like overkill for a program that unconditionally accepts one command-line argument. I scarcely consider that justification for bringing in a third-party library.

Your Constants shouldn't be an Enum; it should be an IntEnum. Your ACCESS_FLAGS should not be a dict of lists; it should be split out into separate IntFlags.

When you print the constant tag, don't print the number; print the symbol. repr (!r) will do this.

I consider int.from_bytes and the variable-length method used in parse_ux to be less explicit than the other two options I'll be demonstrating, which are struct unpacking and ctypes unpacking. Your parse_fields and similar methods should be entirely replaced with big-endian structure definitions.

Don't use dictionaries for internal program data; they aren't well-typed.

Your script will not be very useful until you resolve the constant indices to their respective structures. For instance, your output 'attribute_name_index': 9 would be replaced with a reference to the corresponding constant string.

Replace open(file_path, mode="rb") with file_path.open().

It's actually a pretty reasonable idea to in-memory buffer the file content before deserialising it, and may have performance advantages; but for simplicity I do not include this in my demonstration.

Suggested

The following is a little long-winded, but demonstrates some of the concepts I've talked about above. It has nearly mypy-compliant types, save for the functional enums that mypy does not support.

#!/usr/bin/env python3
import ctypes
import struct
import sys
from dataclasses import dataclass
from enum import IntEnum, IntFlag
from functools import partial
from io import BufferedIOBase
from itertools import chain
from pathlib import Path
from typing import Callable, ClassVar, Iterator, NamedTuple, Type, TypeVar
# Spec from
# https://docs.oracle.com/javase/specs/jvms/se21/html/jvms-4.html
# Since we don't require strict validation, this captures all flags that don't
# have multiple definitions.
ACCESS_SHARED = {
 'PUBLIC' : 0x0001,
 'PRIVATE' : 0x0002,
 'PROTECTED' : 0x0004,
 'STATIC' : 0x0008,
 'FINAL' : 0x0010,
 'NATIVE' : 0x0100,
 'INTERFACE' : 0x0200,
 'ABSTRACT' : 0x0400,
 'STRICT' : 0x0800,
 'SYNTHETIC' : 0x1000,
 'ANNOTATION': 0x2000,
 'ENUM' : 0x4000,
}
# This functional enum form is not mypy-compatible.
CommonAccess = IntFlag('CommonAccess', ACCESS_SHARED)
ClassAccess = IntFlag('ClassAccess', {
 **ACCESS_SHARED,
 'SUPER': 0x0020,
 'MODULE': 0x8000,
})
MethodAccess = IntFlag('MethodAccess', {
 **ACCESS_SHARED,
 'SYNCHRONIZED' : 0x0020,
 'BRIDGE' : 0x0040,
 'VARARGS' : 0x0080,
})
ParameterAccess = IntFlag('ParameterAccess', {
 **ACCESS_SHARED,
 'MANDATED' : 0x8000,
})
ModuleAccess = IntFlag('ModuleAccess', {
 **ACCESS_SHARED,
 'OPEN' : 0x0020,
 'MANDATED' : 0x8000,
})
ModuleRequiresAccess = IntFlag('ModuleRequiresAccess', {
 **ACCESS_SHARED,
 'TRANSITIVE' : 0x0020,
 'STATIC_PHASE' : 0x0040,
 'MANDATED' : 0x8000,
})
FieldAccess = IntFlag('FieldAccess', {
 **ACCESS_SHARED,
 'VOLATILE' : 0x0040,
 'TRANSIENT' : 0x0080,
})
class ConstantTag(IntEnum):
 UTF8 = 1
 INTEGER = 3
 FLOAT = 4
 LONG = 5
 DOUBLE = 6
 CLASS = 7
 STRING = 8
 FIELD_REF = 9
 METHOD_REF = 10
 INTERFACE_METHOD_REF = 11
 NAME_AND_TYPE = 12
 METHOD_HANDLE = 15
 METHOD_TYPE = 16
 DYNAMIC = 17
 INVOKE_DYNAMIC = 18
 MODULE = 19
 PACKAGE = 20
class ReferenceKind(IntEnum):
 getField = 1
 getStatic = 2
 putField = 3
 putStatic = 4
 invokeVirtual = 5
 invokeStatic = 6
 invokeSpecial = 7
 newInvokeSpecial = 8
 invokeInterface = 9
class Version(ctypes.BigEndianStructure):
 _pack_ = 1
 _fields_ = (
 ('magic', ctypes.c_uint32),
 ('minor_version', ctypes.c_uint16),
 ('major_version', ctypes.c_uint16),
 )
 __slots__ = [k for k, t in _fields_]
class Constant:
 CHILDREN: ClassVar[tuple[str, ...]]
class ClassConstant(Constant, ctypes.BigEndianStructure):
 _pack_ = 1
 _fields_ = (
 ('name_index', ctypes.c_uint16),
 )
 CHILDREN = 'name_index',
 __slots__ = ('name_index', 'name_constant')
 def __str__(self) -> str:
 return str(self.name_constant)
ModuleConstant = ClassConstant
PackageConstant = ClassConstant
class DoubleConstant(Constant, ctypes.BigEndianStructure):
 _pack_ = 1
 _fields_ = (
 # Don't represent as high bytes and low bytes in the spec;
 # directly unpack to value
 ('value', ctypes.c_double),
 )
 CHILDREN = ()
 __slots__ = 'value',
 def __str__(self) -> str:
 return str(self.value)
class DynamicConstant(Constant, ctypes.BigEndianStructure):
 _pack_ = 1
 _fields_ = (
 ('bootstrap_method_attr_index', ctypes.c_uint16),
 ('name_and_type_index', ctypes.c_uint16),
 )
 CHILDREN = ('bootstrap_method_attr_index', 'name_and_type_index')
 __slots__ = (
 'bootstrap_method_attr_index', 'bootstrap_method_attr_constant',
 'name_and_type_index', 'name_and_type_constant',
 )
 def __str__(self) -> str:
 return f'{self.name_and_type_constant} -> {self.bootstrap_method_attr_constant}'
class FloatConstant(Constant, ctypes.BigEndianStructure):
 _pack_ = 1
 _fields_ = (
 ('value', ctypes.c_float),
 )
 CHILDREN = ()
 __slots__ = 'value',
 def __str__(self) -> str:
 return str(self.value)
class IntegerConstant(Constant, ctypes.BigEndianStructure):
 _pack_ = 1
 _fields_ = (
 ('value', ctypes.c_int32),
 )
 CHILDREN = ()
 __slots__ = 'value',
 def __str__(self) -> str:
 return str(self.value)
class LongConstant(Constant, ctypes.BigEndianStructure):
 _pack_ = 1
 _fields_ = (
 ('value', ctypes.c_int64),
 )
 CHILDREN = ()
 __slots__ = 'value',
 def __str__(self) -> str:
 return str(self.value)
InvokeDynamicConstant = DynamicConstant
class MethodHandleConstant(Constant, ctypes.BigEndianStructure):
 _pack_ = 1
 _fields_ = (
 ('reference_kind', ctypes.c_uint8),
 ('reference_index', ctypes.c_uint16),
 )
 CHILDREN = 'reference_index',
 __slots__ = (
 'reference_kind',
 'reference_index', 'reference_constant',
 )
 @property
 def kind(self) -> ReferenceKind:
 return ReferenceKind(self.reference_kind)
 def __str__(self) -> str:
 return f'{self.kind.name} {self.reference_constant}'
class MethodRefConstant(Constant, ctypes.BigEndianStructure):
 _pack_ = 1
 _fields_ = (
 ('class_index', ctypes.c_uint16),
 ('name_and_type_index', ctypes.c_uint16),
 )
 CHILDREN = ('class_index', 'name_and_type_index')
 __slots__ = (
 'class_index', 'class_constant',
 'name_and_type_index', 'name_and_type_constant',
 )
 def __str__(self) -> str:
 return str(self.name_and_type_constant)
class MethodTypeConstant(Constant, ctypes.BigEndianStructure):
 _pack_ = 1
 _fields_ = (
 ('descriptor_index', ctypes.c_uint16),
 )
 CHILDREN = 'descriptor_index',
 __slots__ = ('descriptor_index', 'descriptor_constant')
 def __str__(self) -> str:
 return str(self.descriptor_constant)
FieldRefConstant = MethodRefConstant
InterfaceMethodConstant = MethodRefConstant
class NameAndTypeConstant(Constant, ctypes.BigEndianStructure):
 _pack_ = 1
 _fields_ = (
 ('name_index', ctypes.c_uint16),
 ('descriptor_index', ctypes.c_uint16),
 )
 CHILDREN = ('name_index', 'descriptor_index')
 __slots__ = (
 'name_index', 'name_constant',
 'descriptor_index', 'descriptor_constant',
 )
 def __str__(self) -> str:
 return f'{self.name_constant} "{self.descriptor_constant}"'
class StringConstant(Constant, ctypes.BigEndianStructure):
 _pack_ = 1
 _fields_ = (
 ('string_index', ctypes.c_uint16),
 )
 CHILDREN = 'string_index',
 __slots__ = ('string_index', 'string_constant')
 def __str__(self) -> str:
 return str(self.string_constant)
class AttributeInfo(Constant, ctypes.BigEndianStructure):
 _pack_ = 1
 _fields_ = (
 ('attribute_name_index', ctypes.c_uint16),
 ('attribute_length', ctypes.c_uint32),
 )
 data: bytes
 CHILDREN = 'attribute_name_index',
 __slots__ = (
 'attribute_name_index', 'attribute_name_constant',
 'attribute_length', 'data',
 )
 def __str__(self) -> str:
 return f'{self.attribute_name_constant}'
class FieldInfo(Constant, ctypes.BigEndianStructure):
 _pack_ = 1
 _fields_ = (
 ('access_flags', ctypes.c_uint16),
 ('name_index', ctypes.c_uint16),
 ('descriptor_index', ctypes.c_uint16),
 ('attributes_count', ctypes.c_uint16),
 )
 CHILDREN = ('name_index', 'descriptor_index')
 attributes: tuple[AttributeInfo, ...]
 __slots__ = (
 'name_index', 'name_constant',
 'descriptor_index', 'descriptor_constant',
 'access_flags', 'attributes_count',
 'attributes',
 )
 @property
 def access(self) -> FieldAccess:
 return FieldAccess(self.access_flags)
 def __str__(self) -> str:
 s = f'{self.access!r} {self.name_constant} "{self.descriptor_constant}"'
 attrs = ', '.join(str(a) for a in self.attributes)
 if attrs:
 s += ' @ ' + attrs
 return s
class MethodInfo(Constant, ctypes.BigEndianStructure):
 _pack_ = 1
 _fields_ = (
 ('access_flags', ctypes.c_uint16),
 ('name_index', ctypes.c_uint16),
 ('descriptor_index', ctypes.c_uint16),
 ('attributes_count', ctypes.c_uint16),
 )
 CHILDREN = ('name_index', 'descriptor_index')
 attributes: tuple[AttributeInfo, ...]
 __slots__ = (
 'name_index', 'name_constant',
 'descriptor_index', 'descriptor_constant',
 'access_flags', 'attributes_count',
 'attributes',
 )
 @property
 def access(self) -> MethodAccess:
 return MethodAccess(self.access_flags)
 def __str__(self) -> str:
 s = f'{self.access!r} {self.name_constant} "{self.descriptor_constant}"'
 attrs = ', '.join(str(a) for a in self.attributes)
 if attrs:
 s += ' @ ' + attrs
 return s
@dataclass(frozen=True, slots=True)
class UTF8Constant(Constant):
 length: int
 bytes_: bytes
 CHILDREN = ()
 @classmethod
 def read(cls, f: BufferedIOBase) -> 'UTF8Constant':
 length = read_short(f)
 bytes_ = f.read(length)
 return cls(length=length, bytes_=bytes_)
 def __str__(self) -> str:
 return self.bytes_.decode(encoding='utf8')
StructT = TypeVar('StructT', bound=ctypes.BigEndianStructure)
def read_struct(f: BufferedIOBase, type_: Type[StructT]) -> StructT:
 value = type_()
 f.readinto(value)
 return value
def read_short(f: BufferedIOBase) -> int:
 fmt = '>H'
 buffer = f.read(struct.calcsize(fmt))
 value, = struct.unpack(fmt, buffer)
 return value
def read_indices(f: BufferedIOBase, n: int) -> tuple[int, ...]:
 fmt = f'>{n}H'
 buffer = f.read(struct.calcsize(fmt))
 return struct.unpack(fmt, buffer)
def bind_read(type_: Type[StructT]) -> Callable[[BufferedIOBase], StructT]:
 return partial(read_struct, type_=type_)
CONSTANT_READERS = {
 ConstantTag.CLASS: bind_read(ClassConstant),
 ConstantTag.DOUBLE: bind_read(DoubleConstant),
 ConstantTag.DYNAMIC: bind_read(DynamicConstant),
 ConstantTag.FIELD_REF: bind_read(FieldRefConstant),
 ConstantTag.FLOAT: bind_read(FloatConstant),
 ConstantTag.INTEGER: bind_read(IntegerConstant),
 ConstantTag.INTERFACE_METHOD_REF: bind_read(InterfaceMethodConstant),
 ConstantTag.INVOKE_DYNAMIC: bind_read(InvokeDynamicConstant),
 ConstantTag.LONG: bind_read(LongConstant),
 ConstantTag.METHOD_REF: bind_read(MethodRefConstant),
 ConstantTag.METHOD_HANDLE: bind_read(MethodHandleConstant),
 ConstantTag.METHOD_TYPE: bind_read(MethodTypeConstant),
 ConstantTag.MODULE: bind_read(ModuleConstant),
 ConstantTag.NAME_AND_TYPE: bind_read(NameAndTypeConstant),
 ConstantTag.PACKAGE: bind_read(PackageConstant),
 ConstantTag.STRING: bind_read(StringConstant),
 ConstantTag.UTF8: UTF8Constant.read,
}
def generate_constants(f: BufferedIOBase, n: int) -> Iterator[Constant]:
 for _ in range(n):
 tag_value, = f.read(1)
 tag = ConstantTag(tag_value)
 yield CONSTANT_READERS[tag](f)
def generate_attrs(f: BufferedIOBase, n: int) -> Iterator[AttributeInfo]:
 for _ in range(n):
 attr = read_struct(f, AttributeInfo)
 attr.data = f.read(attr.attribute_length)
 yield attr
class Class(NamedTuple):
 major_version: int
 minor_version: int
 access_flags: ClassAccess
 constants: tuple[Constant, ...]
 this_class: Constant
 super_class: Constant
 interfaces: tuple[Constant, ...]
 fields: tuple[FieldInfo, ...]
 methods: tuple[MethodInfo, ...]
 attributes: tuple[AttributeInfo, ...]
 @classmethod
 def deserialise(cls, f: BufferedIOBase) -> 'Class':
 version = read_struct(f=f, type_=Version)
 constant_pool_count = read_short(f)
 constant_pool = tuple(generate_constants(f, n=constant_pool_count - 1))
 access_flags = ClassAccess(read_short(f))
 this_class = read_short(f)
 super_class = read_short(f)
 interfaces_count = read_short(f)
 interfaces = read_indices(f=f, n=interfaces_count)
 fields_count = read_short(f)
 fields = [
 (
 field := read_struct(f, FieldInfo),
 tuple(generate_attrs(f, field.attributes_count)),
 )
 for _ in range(fields_count)
 ]
 methods_count = read_short(f)
 methods = [
 (
 method := read_struct(f, MethodInfo),
 tuple(generate_attrs(f, method.attributes_count)),
 )
 for _ in range(methods_count)
 ]
 attributes_count = read_short(f)
 attributes = tuple(generate_attrs(f, attributes_count))
 trailing = len(f.read())
 if trailing != 0:
 raise ValueError(f'{trailing} trailing bytes after deserialise')
 return cls._traverse(
 version=version, constants=constant_pool, access_flags=access_flags,
 this_idx=this_class, interfaces=interfaces,
 super_idx=super_class, attributes=attributes,
 fields=fields, methods=methods,
 )
 @classmethod
 def _traverse(
 cls,
 version: Version,
 constants: tuple[Constant, ...],
 access_flags: ClassAccess,
 this_idx: int,
 super_idx: int,
 interfaces: tuple[int, ...],
 fields: list[
 tuple[
 FieldInfo,
 tuple[AttributeInfo, ...],
 ]
 ],
 methods: list[
 tuple[
 MethodInfo,
 tuple[AttributeInfo, ...],
 ]
 ],
 attributes: tuple[AttributeInfo, ...],
 ) -> 'Class':
 field_constants = [field[0] for field in fields]
 method_constants = [method[0] for method in methods]
 all_nodes: tuple[Constant, ...] = (
 *constants,
 *field_constants,
 *method_constants,
 *chain.from_iterable(field[1] for field in fields),
 *chain.from_iterable(method[1] for method in methods),
 *attributes,
 )
 for constant in all_nodes:
 for child_name in constant.CHILDREN:
 varname = child_name.removesuffix('_index') + '_constant'
 child = constants[getattr(constant, child_name)]
 setattr(constant, varname, child)
 for field, attrs in fields:
 field.attributes = attrs
 for method, attrs in methods:
 method.attributes = attrs
 return cls(
 major_version=version.major_version,
 minor_version=version.minor_version,
 access_flags=access_flags,
 constants=constants,
 this_class=constants[this_idx],
 super_class=constants[super_idx],
 interfaces=tuple(
 constants[idx] for idx in interfaces
 ),
 fields=tuple(field_constants),
 methods=tuple(method_constants),
 attributes=attributes,
 )
 def dump(self, verbose: bool = False) -> Iterator[str]:
 yield f'Version {self.major_version}.{self.minor_version}'
 yield f'Class: {self.this_class}'
 yield f'Super: {self.super_class}'
 yield f'Access flags: {self.access_flags!r}'
 yield 'Attributes:'
 for attr in self.attributes:
 yield f' {attr}'
 yield 'Fields:'
 for field in self.fields:
 yield f' {field}'
 yield 'Methods:'
 for method in self.methods:
 yield f' {method}'
 yield 'Interfaces:'
 for iface in self.interfaces:
 yield f' {iface}'
 if verbose:
 yield 'Constant pool:'
 for constant in self.constants:
 yield f' {constant}'
def main() -> None:
 _, file_path = sys.argv
 with Path(file_path).open(mode='rb') as f:
 class_ = Class.deserialise(f)
 print('\n'.join(class_.dump(verbose=True)))
if __name__ == '__main__':
 main()

Output (simple example)

Version 65.0
Class: I "comparator"
Super: ()V "TopByOrder"
Access flags: <ClassAccess.SUPER|PUBLIC: 33>
Attributes:
 Ljava/util/Comparator<TE;>;
 TopByOrder.java
 InnerClasses
 CountingComparator
Fields:
 <FieldAccess.FINAL|PUBLIC: 17> I "comparator"
 <FieldAccess.FINAL|PUBLIC: 17> Ljava/util/Comparator; "(ILjava/util/Comparator;)V"Ljava/util/Comparator<TE;>;
Methods:
 <MethodAccess.PUBLIC: 1> ()V "java/util/Collection" @ LineNumberTable, Ljava/util/Comparator<TE;>;
 <MethodAccess.PUBLIC: 1> (Ljava/util/Collection;)Ljava/util/PriorityQueue; "java/util/Collection" @ LineNumberTable, Ljava/util/Comparator<TE;>;
 <MethodAccess.STATIC|PUBLIC: 9> ([Ljava/lang/String;)V "Ljava/lang/String;" @ LineNumberTable

Output (more complex example)

This one has Dynamics.

Version 65.0
Class: (Ljava/util/List;)Ljava/util/List; "(Ljava/util/List;)V"
Super: ()V "java/lang/Object"
Access flags: <ClassAccess.SUPER|FINAL|PUBLIC: 49>
Attributes:
 MultipleGroupPermuterDemo.java
 <ReferenceKind.invokeVirtual: 5> (Ljava/lang/Integer;)I "<ReferenceKind.invokeStatic: 6> (Ljava/lang/invoke/MethodHandles$Lookup;Ljava/lang/String;Ljava/lang/invoke/MethodType;Ljava/lang/String;[Ljava/lang/Object;)Ljava/lang/invoke/CallSite;"
 Lookup
Fields:
Methods:
 <MethodAccess.PUBLIC: 1> ()V "java/lang/Object" @ LineNumberTable
 <MethodAccess.STATIC|PUBLIC: 9> ([Ljava/lang/String;)V "groupPermutation" @ LineNumberTable
 <MethodAccess.STATIC|PRIVATE: 10> (Ljava/util/Map;)I "makeConcatWithConstants -> <init>" @ LineNumberTable, <T:Ljava/lang/Object;>(Ljava/util/Map<Ljava/util/List<Ljava/util/List<TT;>;>;Ljava/lang/Integer;>;)I
 <MethodAccess.STATIC|PRIVATE: 10> (Ljava/util/List;)Ljava/lang/String; "java/io/PrintStream" @ LineNumberTable, <T:Ljava/lang/Object;>(Ljava/util/Map<Ljava/util/List<Ljava/util/List<TT;>;>;Ljava/lang/Integer;>;)I
 <MethodAccess.STATIC|PRIVATE: 10> format "java/io/PrintStream" @ LineNumberTable, <T:Ljava/lang/Object;>(Ljava/util/Map<Ljava/util/List<Ljava/util/List<TT;>;>;Ljava/lang/Integer;>;)I
 <MethodAccess.STATIC|PRIVATE: 10> (Ljava/util/List;)Ljava/util/List; "(Ljava/util/List;)V" @ LineNumberTable, <T:Ljava/lang/Object;>(Ljava/util/Map<Ljava/util/List<Ljava/util/List<TT;>;>;Ljava/lang/Integer;>;)I
 <MethodAccess.STATIC|PRIVATE: 10> java/lang/System "computeGroupPermutations" @ LineNumberTable, <T:Ljava/lang/Object;>(Ljava/util/Map<Ljava/util/List<Ljava/util/List<TT;>;>;Ljava/lang/Integer;>;)I
answered Feb 25, 2024 at 4:23
\$\endgroup\$

Your Answer

Draft saved
Draft discarded

Sign up or log in

Sign up using Google
Sign up using Email and Password

Post as a guest

Required, but never shown

Post as a guest

Required, but never shown

By clicking "Post Your Answer", you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.