I have written a Python script that converts scripts to PNG files, entirely by myself without anybody's help.
A picture is worth more than a thousand words, so I will let the picture do the describing.
Code
import numpy as np
from io import BytesIO
from pathlib import Path
from PIL import Image
from pygments import highlight
from pygments.formatters import ImageFormatter
from pygments.lexers import get_lexer_by_name
from pygments.style import Style
from pygments.token import Name, Comment, String, Punctuation, Number, Operator, Literal, Keyword
class MyStyle(Style):
    """Dark pygments colour scheme used when rendering scripts to images.

    ``styles`` maps pygments token types to style strings.  The broad token
    categories also carry an explicit ``bg:#00122a`` background; the more
    specific sub-types list only the attributes they set themselves.
    """

    styles = {
        # Keywords
        Keyword: 'bold #07487f bg:#00122a',

        # Names and their more specific sub-types
        Name: '#7b68ee bg:#00122a',
        Name.Builtin: 'italic #fb4570',
        Name.Class: 'bold italic #ffd700',
        Name.Constant: 'bold',
        Name.Decorator: 'bold italic',
        Name.Exception: 'bold italic #ff005d',
        Name.Function: 'bold italic #134dbc',
        Name.Namespace: 'bold italic #ffa000',
        Name.Variable.Class: 'bold',
        Name.Variable.Global: 'bold',

        # Literals: strings and numbers
        Literal: '#00ff80 bg:#00122a',
        String: '#20ff40 bg:#00122a',
        String.Escape: 'italic #d06000',
        Number: '#00c0ff bg:#00122a',

        # Operators and punctuation
        Operator: 'bold #408020 bg:#00122a',
        Operator.Word: 'bold #0000f0',
        Punctuation: '#ff00c0 bg:#00122a',

        # Comments
        Comment: 'italic #800080 bg:#00122a',
    }
def script2png(filepath, language='python'):
    """Render the script at *filepath* to a syntax-highlighted PNG image.

    The file is read as UTF-8 and trailing whitespace is stripped from every
    line before highlighting with ``MyStyle``.  The image is saved as
    ``filepath + '.png'``; the output name is built by string concatenation.

    Args:
        filepath: Location of the script to render.
        language: Lexer name handed to ``get_lexer_by_name``.
    """
    source = Path(filepath).read_text(encoding='utf8')
    code = '\n'.join(line.rstrip() for line in source.splitlines())

    formatter = ImageFormatter(
        style=MyStyle,
        full=True,
        font_name='Source Code Pro',
        font_size=16,
        line_number_bg='#003b6f',
        line_number_fg='#ff69b4',
        line_number_bold=True,
        line_pad=3,
    )
    rendered = highlight(code, get_lexer_by_name(language), formatter)

    # Repaint every pure-white pixel with (0, 18, 42), i.e. #00122a, the
    # background colour used throughout MyStyle.
    pixels = np.array(Image.open(BytesIO(rendered)))
    is_white = np.all(pixels == (255, 255, 255), axis=-1)
    pixels[is_white] = (0, 18, 42)

    Image.fromarray(pixels).save(filepath + '.png')
How is the code? This is my first time writing something like this, and I really want to know how to improve it.
1 Answer 1
You're importing a lot of symbols from `pygments.token`. It is probably cleaner to alias that import and clean up your namespace a bit.
I don't see why you're jumping through so many hoops when splitting and joining your text. Why not just read the entire file as a string and leave it be?
`pygments` is poorly designed. It has a lot of internal inconsistencies, such as some `Style` members being respected for some formatter subclasses and not others. You should be moving as much styling information to your style class as possible. `pygments` has a deeply confused interpretation of `line_number_bg`: it can be (and often is) expressed as `Style.line_number_background_color`, but the latter is only respected in CSS mode. Still: better to define it on your class, and manually pass it in to the `ImageFormatter` constructor, which is broken and does not respect the style.
Do not use Numpy or Pillow and do not replace your background by colour. Instead, define `Style.background_color`.
Both your `BytesIO` and your `Image` should be put in a `with` for guaranteed closure.
Unless you need to do something strange and non-standard, it is more convenient to remove your `language` argument and replace `get_lexer_by_name` with `get_lexer_for_filename`.
If you absolutely require a `.png` you should be passing that format explicitly to the formatter. Otherwise, a reasonable implementation would be a `script2img` that allows the `ImageFormatter` to use its default format, whatever that may be; does not hard-code the `.png` suffix; and instead pulls that from `formatter.image_format`.
Suggested
from pathlib import Path
from pygments import highlight
from pygments.formatters import ImageFormatter
from pygments.lexers import get_lexer_for_filename
from pygments.style import Style
import pygments.token as tokens
class MyStyle(Style):
    """Dark pygments colour scheme with its colours defined on the class.

    ``background_color`` sets the overall background and is reused in the
    ``bg:`` part of the broad token categories below.
    """

    background_color = '#00122a'

    # Only applies to HtmlFormatter; script2img passes these two values to
    # ImageFormatter explicitly as line_number_bg / line_number_fg.
    line_number_background_color = '#003b6f'
    line_number_color = '#ff69b4'

    styles = {
        # Keywords
        tokens.Keyword: f'bold #07487f bg:{background_color}',

        # Names and their more specific sub-types
        tokens.Name: f'#7b68ee bg:{background_color}',
        tokens.Name.Builtin: 'italic #fb4570',
        tokens.Name.Class: 'bold italic #ffd700',
        tokens.Name.Constant: 'bold',
        tokens.Name.Decorator: 'bold italic',
        tokens.Name.Exception: 'bold italic #ff005d',
        tokens.Name.Function: 'bold italic #134dbc',
        tokens.Name.Namespace: 'bold italic #ffa000',
        tokens.Name.Variable.Class: 'bold',
        tokens.Name.Variable.Global: 'bold',

        # Literals: strings and numbers
        tokens.Literal: f'#00ff80 bg:{background_color}',
        tokens.String: f'#20ff40 bg:{background_color}',
        tokens.String.Escape: 'italic #d06000',
        tokens.Number: f'#00c0ff bg:{background_color}',

        # Operators and punctuation
        tokens.Operator: f'bold #408020 bg:{background_color}',
        tokens.Operator.Word: 'bold #0000f0',
        tokens.Punctuation: f'#ff00c0 bg:{background_color}',

        # Comments
        tokens.Comment: f'italic #800080 bg:{background_color}',
    }
def script2img(filepath: Path) -> None:
    """Render the source file at *filepath* to a syntax-highlighted image.

    The lexer is inferred from the file name.  The image is written next to
    the input, with the input's suffix replaced by the formatter's image
    format (``formatter.image_format``).

    Args:
        filepath: Path of the script to render; it is read as UTF-8.

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
        UnicodeDecodeError: If the input is not valid UTF-8.
        pygments.util.ClassNotFound: If no lexer matches the file name.
    """
    formatter = ImageFormatter(
        style=MyStyle, full=True, line_pad=3,
        font_name='Source Code Pro', font_size=16,
        line_number_bold=True,
        # ImageFormatter does not pick these up from the style class, so they
        # are forwarded explicitly.
        line_number_fg=MyStyle.line_number_color,
        line_number_bg=MyStyle.line_number_background_color,
    )

    image_bytes = highlight(
        # Decode explicitly as UTF-8: without an encoding, read_text() falls
        # back to the locale's preferred encoding, which varies by platform.
        code=filepath.read_text(encoding='utf-8'),
        lexer=get_lexer_for_filename(filepath),
        formatter=formatter,
    )

    out_path = filepath.with_suffix('.' + formatter.image_format)
    out_path.write_bytes(image_bytes)
if __name__ == '__main__':
    # Demo run: render the sample script named 275768.py.
    script2img(
        Path('275768.py'),
    )
Output
Rendering its own source,
This works equally well (still without passing a `language`) when inferring Java:
Caching configuration
If you're rendering multiple files at a time, you may want to make a class version that caches the instantiated formatter:
from pathlib import Path
from pygments import highlight
from pygments.formatters import ImageFormatter
from pygments.lexers import get_lexer_for_filename
from pygments.style import Style
import pygments.token as tokens
class MyStyle(Style):
    """Dark pygments colour scheme with its colours defined on the class.

    ``background_color`` sets the overall background and is reused in the
    ``bg:`` part of the broad token categories below.
    """

    background_color = '#00122a'

    # Only applies to HtmlFormatter; Highlighter passes these two values to
    # ImageFormatter explicitly as line_number_bg / line_number_fg.
    line_number_background_color = '#003b6f'
    line_number_color = '#ff69b4'

    styles = {
        # Keywords
        tokens.Keyword: f'bold #07487f bg:{background_color}',

        # Names and their more specific sub-types
        tokens.Name: f'#7b68ee bg:{background_color}',
        tokens.Name.Builtin: 'italic #fb4570',
        tokens.Name.Class: 'bold italic #ffd700',
        tokens.Name.Constant: 'bold',
        tokens.Name.Decorator: 'bold italic',
        tokens.Name.Exception: 'bold italic #ff005d',
        tokens.Name.Function: 'bold italic #134dbc',
        tokens.Name.Namespace: 'bold italic #ffa000',
        tokens.Name.Variable.Class: 'bold',
        tokens.Name.Variable.Global: 'bold',

        # Literals: strings and numbers
        tokens.Literal: f'#00ff80 bg:{background_color}',
        tokens.String: f'#20ff40 bg:{background_color}',
        tokens.String.Escape: 'italic #d06000',
        tokens.Number: f'#00c0ff bg:{background_color}',

        # Operators and punctuation
        tokens.Operator: f'bold #408020 bg:{background_color}',
        tokens.Operator.Word: 'bold #0000f0',
        tokens.Punctuation: f'#ff00c0 bg:{background_color}',

        # Comments
        tokens.Comment: f'italic #800080 bg:{background_color}',
    }
class Highlighter:
    """Render source files to images with one reusable ``ImageFormatter``.

    The formatter is built once in ``__init__`` so that rendering several
    files does not repeat its construction.

    Attributes:
        formatter: The shared ``ImageFormatter`` configured with ``MyStyle``.
        extension: Output suffix including the leading dot, taken from
            ``formatter.image_format``.
    """

    def __init__(self) -> None:
        """Build the shared formatter and derive the output suffix from it."""
        self.formatter = ImageFormatter(
            style=MyStyle, full=True, line_pad=3,
            font_name='Source Code Pro', font_size=16,
            line_number_bold=True,
            # ImageFormatter does not pick these up from the style class, so
            # they are forwarded explicitly.
            line_number_fg=MyStyle.line_number_color,
            line_number_bg=MyStyle.line_number_background_color,
        )
        self.extension = '.' + self.formatter.image_format

    def script_to_image(self, filepath: Path) -> Path:
        """Render *filepath* to an image and return the image's path.

        The lexer is inferred from the file name.  The image is written next
        to the input, with the input's suffix replaced by ``self.extension``.

        Args:
            filepath: Path of the script to render; it is read as UTF-8.

        Returns:
            The path of the image file that was written.

        Raises:
            OSError: If the input cannot be read or the output written.
            UnicodeDecodeError: If the input is not valid UTF-8.
            pygments.util.ClassNotFound: If no lexer matches the file name.
        """
        image_bytes = highlight(
            # Decode explicitly as UTF-8: without an encoding, read_text()
            # falls back to the locale's preferred encoding, which varies by
            # platform.
            code=filepath.read_text(encoding='utf-8'),
            lexer=get_lexer_for_filename(filepath),
            formatter=self.formatter,
        )
        out_path = filepath.with_suffix(self.extension)
        out_path.write_bytes(image_bytes)
        return out_path
if __name__ == '__main__':
    # Demo run: render the sample script named 275768.py.
    Highlighter().script_to_image(
        Path('275768.py'),
    )
-
1\$\begingroup\$ Good answer, but I can't accept it just yet. You know, gotta wait for 24 hours so people around the globe can have a chance to see the question and post an answer. \$\endgroup\$Ξένη Γήινος– Ξένη Γήινος2022年04月15日 19:14:09 +00:00Commented Apr 15, 2022 at 19:14
-
About splitting and joining the text: it is to remove trailing whitespace. I sometimes write code in Windows Terminal to see what the output will be, and then copy and paste the code into Visual Studio Code. I use Alt+Drag to select the code, which leaves trailing spaces. I guess they would make the output sub-optimal, so I want to remove them. — Ξένη Γήινος, Apr 16, 2022 at 5:13
-
The only effect will be a broadened right margin. I consider that harmless, but if you really care about it, you could follow your original approach or do something like `re.sub(r' *\n', '\n', code)`. — Reinderien, Apr 16, 2022 at 11:32