I'm writing some small programs to practice Rust, and this one's a string compressor for the golfing language Jelly (unofficial spec). It involves some non-ASCII stuff and bigints (from the num-bigint
crate).
use std::io;
use std::io::Write;
use num_bigint::{BigUint, ToBigUint};
use num_traits::cast::ToPrimitive;
use std::fs;
use std::convert::TryFrom;
fn low_first(word: &str) -> String {
let mut word_chars = word.chars();
let first = word_chars.next().unwrap();
first.to_lowercase().to_string() + &word_chars.collect::<String>()
}
enum MightContain {
Contains(usize),
ContainsCaps(usize),
Missing
}
fn dict_contains(word: &String, dict: &Vec<&str>) -> MightContain {
let low_word = word.to_lowercase();
let low_first_word = low_first(word);
let mut low: usize = 0;
let mut high: usize = dict.len();
while high > low {
let mid = (low + high) / 2;
let d_word = dict[mid];
let low_d_word = d_word.to_lowercase();
if low_d_word < low_word {
low = mid + 1;
} else if low_d_word > low_word {
high = mid;
} else {
if word == d_word {
return MightContain::Contains(mid);
} else if low_first_word == low_first(d_word) {
return MightContain::ContainsCaps(mid);
} else {
return MightContain::Missing;
}
}
}
MightContain::Missing
}
fn calc_index(s: usize, ds: usize, p: usize, q: usize) -> usize {
usize::try_from(
isize::try_from(ds - s).unwrap() -
isize::try_from((((s - 1) - p) * (((s - 1) - p) + 1)) / 2).unwrap() -
isize::try_from(p).unwrap() +
isize::try_from(q).unwrap() - 2
).unwrap()
}
fn char_diagram(word: &String) -> String {
let mut diagram = String::from("(");
for char in word.chars() {
diagram.push(char);
diagram.push_str(")(");
}
diagram.pop();
diagram
}
#[derive(Clone)]
struct ModPair {
add: BigUint,
scale: BigUint
}
fn char_coding(word: &String) -> ModPair {
let mut add = 0u32.to_biguint().unwrap();
let mut scale = 1u32.to_biguint().unwrap();
for char in word.chars().rev() {
add = (add * 96u32.to_biguint().unwrap() + (if char == '\n' { 95 } else { (char as u32) - 32 }).to_biguint().unwrap()) * 3u32.to_biguint().unwrap();
scale *= (96 * 3).to_biguint().unwrap();
}
return ModPair {
add: add,
scale: scale
};
}
fn dict_coding(short_dict: bool, caps: bool, wrong_sp: bool, index: usize) -> ModPair {
let mut add = index.to_biguint().unwrap();
let mut scale = (if short_dict { 20453u32 } else { 227845u32 }).to_biguint().unwrap();
add = add * 2u32.to_biguint().unwrap() + (if short_dict { 1u32 } else { 0u32 }).to_biguint().unwrap();
scale *= 2u32.to_biguint().unwrap();
if caps || wrong_sp {
add = (add * 3u32.to_biguint().unwrap() + (if wrong_sp { if caps { 2u32 } else { 1u32 } } else { 0u32 }).to_biguint().unwrap()) * 3u32.to_biguint().unwrap() + 2u32.to_biguint().unwrap();
scale *= 9u32.to_biguint().unwrap();
} else {
add = add * 3u32.to_biguint().unwrap() + 1u32.to_biguint().unwrap();
scale *= 3u32.to_biguint().unwrap();
}
return ModPair {
add: add,
scale: scale
};
}
fn missing_coding(first: &ModPair, second: &ModPair) -> ModPair {
ModPair {
add: &second.add * &first.scale + &first.add,
scale: &first.scale * &second.scale
}
}
fn main() {
print!("ASCII: ");
let mut input_string = String::new();
io::stdout().flush().unwrap();
io::stdin().read_line(&mut input_string).unwrap();
print!("\n");
input_string.pop();
let mut string = String::new();
for char in input_string.chars() {
if char == '¶' {
string.push('\n');
continue;
}
if char < ' ' || char > '~' {
eprintln!("Contains non-ASCII");
std::process::exit(1);
}
string.push(char);
}
let raw_short_dict = fs::read_to_string("short.dict").expect("Could not read short.dict");
let raw_long_dict = fs::read_to_string("long.dict").expect("Could not read long.dict");
let short_dict: Vec<&str> = raw_short_dict.split("\n").collect();
let long_dict: Vec<&str> = raw_long_dict.split("\n").collect();
let string_length = string.chars().count();
let tri_string_length = string_length * (string_length + 1) / 2;
if string_length == 0 {
println!("Optimal cost: {} bits", 0.0);
println!("Diagram: {}", "");
return;
}
if string_length == 1 {
println!("Optimal cost: {} bits", f64::round(f64::log2(3.0 * 96.0) * 100.0) / 100.0);
println!("Diagram: {}", string);
return;
}
let mut optimal_costs: Vec<f64> = vec![0.0; tri_string_length - string_length];
let mut diagrams: Vec<String> = vec!["".to_string(); tri_string_length - string_length];
let mut codings: Vec<ModPair> = vec![ModPair {
add: 0u32.to_biguint().unwrap(),
scale: 0u32.to_biguint().unwrap()
}; tri_string_length - string_length];
for i in 2..(string_length + 1) {
for k in 0..(string_length - (i - 1)) {
let word = string.chars().skip(k).take(i).collect::<String>();
let no_sp_word;
if word.chars().next().unwrap() == ' ' {
no_sp_word = word[1..].to_string();
} else {
no_sp_word = word[..].to_string();
}
let mut cost = f64::log2(3.0 * 96.0) * (i as f64);
let mut diagram = char_diagram(&word);
let mut coding = char_coding(&word);
let contains = if i < 60 { dict_contains(&no_sp_word, if no_sp_word.len() <= 5 { &short_dict } else { &long_dict }) } else { MightContain::Missing };
match contains {
MightContain::Contains(dict_index) => {
let word_cost = f64::log2(3.0 * 2.0 * (if (word.chars().next().unwrap() == ' ') == (k == 0) { 3.0 } else { 1.0 }) * (if no_sp_word.len() <= 5 { 20453.0 } else { 227845.0 }));
if word_cost < cost {
cost = word_cost;
diagram = format!("({})", word);
coding = dict_coding(no_sp_word.len() <= 5, false, (word.chars().next().unwrap() == ' ') == (k == 0), dict_index);
}
},
MightContain::ContainsCaps(dict_index) => {
let word_cost = f64::log2(3.0 * 2.0 * 3.0 * (if no_sp_word.len() <= 5 { 20453.0 } else { 227845.0 }));
if word_cost < cost {
cost = word_cost;
diagram = format!("({})", word);
coding = dict_coding(no_sp_word.len() <= 5, true, (word.chars().next().unwrap() == ' ') == (k == 0), dict_index);
}
},
MightContain::Missing => {
if i >= 3 {
let first_index = calc_index(string_length, tri_string_length, k + 1, k + i);
let first_cost = f64::log2(3.0 * 96.0) + optimal_costs[first_index];
if first_cost < cost {
cost = first_cost;
diagram = format!("({}){}", word.chars().next().unwrap(), diagrams[first_index]);
coding = missing_coding(&char_coding(&String::from(word.chars().next().unwrap())), &codings[first_index]);
}
let second_index = calc_index(string_length, tri_string_length, k, (k + i) - 1);
let second_cost = optimal_costs[second_index] + f64::log2(3.0 * 96.0);
if second_cost < cost {
cost = second_cost;
diagram = format!("{}({})", diagrams[second_index], word.chars().last().unwrap());
coding = missing_coding(&codings[second_index], &char_coding(&String::from(word.chars().last().unwrap())));
}
}
if i >= 4 {
for split in 2..((i + 1) - 2) {
let first_index = calc_index(string_length, tri_string_length, k, k + split);
let second_index = calc_index(string_length, tri_string_length, k + split, k + i);
let split_cost = optimal_costs[first_index] + optimal_costs[second_index];
if split_cost < cost {
cost = split_cost;
diagram = format!("{}{}", diagrams[first_index], diagrams[second_index]);
coding = missing_coding(&codings[first_index], &codings[second_index]);
}
// cost = cmp::min_by(cost, , |x, y| x.partial_cmp(y).unwrap())
}
}
}
}
let index = calc_index(string_length, tri_string_length, k, k + i);
optimal_costs[index] = cost;
diagrams[index] = diagram;
codings[index] = coding;
}
}
let string_index = calc_index(string_length, tri_string_length, 0, string_length);
println!("Optimal cost: {} bits", f64::round(optimal_costs[string_index] * 100.0) / 100.0);
println!("Diagram: {}", diagrams[string_index]);
print!("\n");
let code_page = concat!(
×ばつØŒÞßæçðıȷñ÷øœþ !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~¶",
"°123456789+−=()ƁƇƊƑƓƘⱮƝƤƬƲȤɓƈɗƒɠɦƙɱɲƥʠɼʂƭʋȥẠḄḌẸḤỊḲḶṂṆỌṚṢṬỤṾẈỴẒȦḂĊḊĖḞĠḢİL·ṀṄȮṖṘṠṪẆẊẎŻạḅḍẹḥịḳḷṃṇọṛṣṭ§Äẉỵẓȧḃċḋėḟġḣl·ṁṅȯṗṙṡṫẇẋẏż"
);
let mut final_string = String::new();
let mut n = codings[string_index].add.clone();
while n != 0u32.to_biguint().unwrap() {
let mut digit = &n % 250u32.to_biguint().unwrap();
if digit == 0u32.to_biguint().unwrap() {
digit = 250u32.to_biguint().unwrap();
}
n = (&n - &digit) / 250u32.to_biguint().unwrap();
digit -= 1u32.to_biguint().unwrap();
final_string.push(code_page.chars().collect::<Vec<char>>()[digit.to_usize().unwrap()]);
}
println!("String: {}", final_string.chars().rev().collect::<String>());
}
It reads a dictionary of words from files called short.dict
and long.dict
(you can find those here if you want to run it yourself), and its dependencies are num-bigint
0.4.3 and num-traits
0.2.11. When you run it, you just type some text, and it'll show you how it's grouped into words and what the resulting compressed output is. I think it's approximately O(n3)
, but it's reasonably fast for inputs up to around a half kilobyte.
I'm pretty new to Rust so feedback on any parts of it would be helpful, but in particular:
- Is there a better way to initialize the vector
codings
? Currently I fill it with uselessModPair
s which get overwritten later, which feels like a waste of time and memory, but usingVec::with_capacity(...)
would cause errors when I tried to use the vector (since the capacity was right but the length was0
) - Is there a better way to do math with signed and unsigned number types in things like the
calc_index
function? I have to do a ton oftry_from
s andunwrap
s, since subtracting theusize
s sometimes would result in negative numbers in an in-between step (but the result is always positive, so I have totry_from
andunwrap
to ausize
again) - Is there a more elegant way to do math with bigints? For every constant I add or multiply to a bigint, I have to do something like
3u32.to_biguint().unwrap()
, which is really clumsy and verbose
Thanks!
1 Answer 1
Q & A
Is there a better way to initialize the vector
codings
? Currently I fill it with uselessModPairs
which get overwritten later, which feels like a waste of time and memory, but usingVec::with_capacity(...)
would cause errors when I tried to use the vector (since the capacity was right but the length was0
)
let codings = Vec::with_capacity(/* ... */);
for i in /* ... */ {
// instead of `codings[i] = /* ... */`
codings.push(/* ... */);
}
Is there a better way to do math with signed and unsigned number types in things like the
calc_index
function? I have to do a ton oftry_from
s andunwrap
s, since subtracting theusize
s sometimes would result in negative numbers in an in-between step (but the result is always positive, so I have totry_from
andunwrap
to ausize
again)
Take calc_index
for example.
The formula is a bit beyond my understanding,
but I think you can avoid negative values
by rearranging the terms
such that addition happens before subtraction
(if overflow isn't an issue):
fn calc_index(s: usize, ds: usize, p: usize, q: usize) -> usize {
q + ds - s - ((((s - 1) - p) * (((s - 1) - p) + 1)) / 2) - p - 2
}
Is there a more elegant way to do math with bigints? For every constant I add or multiply to a bigint, I have to do something like
3u32.to_biguint().unwrap()
, which is really clumsy and verbose
Operations such as BigUint * u32
are supported.
See the section on char_coding
.
Formatting
Running rustfmt
(via cargo fmt
) formats the code
according to the official Rust Style Guide.
Personally, I like to merge use
declarations:
use {
num_bigint::{BigUint, ToBigUint},
num_traits::cast::ToPrimitive,
std::{
convert::TryFrom,
fs,
io::{self, Write},
},
};
Clippy
cargo clippy
gives the following suggestions:
print!("\n")
becomesprintln!()
, andprintln!("Diagram: {}", "")
is justprintln!("Diagram: ")
.else { if .. }
can be collapsed intoelse if ..
.ModPair { add: add, scale: scale }
can be replaced with the shorthandModPair { add, scale }
.Functions should generally take
&str
instead of&String
. See [Why is it discouraged to accept a reference to aString
(&String
),Vec
(&Vec
), orBox
(&Box
) as a function argument?]Explicit
return
is unnecessary at the end of a block.Clippy suggests rewriting
char < ' ' || char > '~'
as!(' '..='~').contains(&char)
..split("\n")
can be.split('\n')
.word.chars().next().unwrap() == ' '
can be simplified toword.starts_with(' ')
(except the latter does not panic ifword.is_empty()
).
low_first
fn low_first(word: &str) -> String { let mut word_chars = word.chars(); let first = word_chars.next().unwrap(); first.to_lowercase().to_string() + &word_chars.collect::<String>() }
collect
allocates a String
,
which is appended to another String
;
quite wasteful (in theory, at least).
In fact, Chars
(the type of word.chars()
)
provide a method as_str
that exposes its &str
representation:
first.to_lowercase().to_string() + word_chars.as_str()
dict_contains
Take word: &str, dict: &[&str]
as arguments instead.
We can use binary_search_by_key
to simplify the function:
fn dict_contains(word: &str, dict: &[&str]) -> MightContain {
let low_word = word.to_ascii_lowercase();
let low_first_word = low_first(word);
let pos = match dict.binary_search_by_key(&low_word, |s| s.to_ascii_lowercase()) {
Ok(pos) => pos,
Err(_) => return MightContain::Missing,
};
let d_word = dict[pos];
if word == d_word {
MightContain::Contains(pos)
} else if low_first_word == low_first(d_word) {
MightContain::ContainsCaps(pos)
} else {
MightContain::Missing
}
}
Note that to_ascii_lowercase
is better than to_lowercase
if the string is known to be ASCII-only.
char_diagram
More intuitively:
fn char_diagram(word: &String) -> String {
word.chars().flat_map(|c| ['(', c, ')']).collect()
}
char_coding
96u32.to_biguint().unwrap()
can be simplified to BigUint::from(96_u32)
.
Eliminating these magic constants helps clarify their meanings.
Also, I find *=
and +=
more readable here:
fn char_coding(word: &String) -> ModPair {
use num_traits::{One, Zero};
let mut add = BigUint::zero();
let mut scale = BigUint::one();
// better names welcome
const ASCII_OFFSET: u32 = 32;
const CHAR_RADIX: u32 = 96;
const CODE_RADIX: u32 = 3;
for c in word.chars().rev() {
add *= CHAR_RADIX;
add += match c {
'\n' => CHAR_RADIX - 1,
_ => c as u32 - ASCII_OFFSET,
};
add *= CODE_RADIX;
scale *= CHAR_RADIX * CODE_RADIX;
}
ModPair { add, scale }
}
Note that BigUint
supports built-in unsigned integer types
on the right-hand side of arithmetic operators.
I also renamed char
to c
since char
is a built-in type name.
Similar comments apply to dict_coding
.
Printing
print!("ASCII: "); // ... io::stdout().flush().unwrap(); // ... print!("\n");
I recommended printing the prompt to stderr
,
which not only eliminates the need to manually flush
but also allows the user to separate actual output from prompts
by separating stdout
and stderr
:
eprint!("ASCII: ");
Pre-processing input
let mut string = String::new(); for char in input_string.chars() { if char == '¶' { string.push('\n'); continue; } if char < ' ' || char > '~' { eprintln!("Contains non-ASCII"); std::process::exit(1); } string.push(char); }
How about
let string = input_string.replace('¶', "\n");
if string.chars().any(|c| !(' '..='~').contains(&c)) {
eprintln!("Contains non-ASCII");
std::process::exit(1);
}
Other parts of main
main
is a very long function.
It would be more convenient if the paths to the dictionaries are provided as arguments.
2..(string_length + 1)
is 2..=string_length
.
string.chars().skip(k).take(i).collect::<String>
is string[k..(k + i)].to_owned()
since string
is ASCII-only.
Instead of
let no_sp_word; if word.chars().next().unwrap() == ' ' { no_sp_word = word[1..].to_string(); } else { no_sp_word = word[..].to_string(); }
we have
let no_sp_word = if word.starts_with(' ') {
&word[1..]
} else {
&word[..]
};
(note that allocation is unnecessary).
-
\$\begingroup\$ Thanks, there's a lot of helpful information here! I don't think the suggestion of
push
ing tocodings
would work since I don't insert items in order, but maybe I couldpush
to a secondVec
, and use aVec<usize>
to map between the indices. \$\endgroup\$rydwolf– rydwolf2022年06月29日 15:42:13 +00:00Commented Jun 29, 2022 at 15:42 -
1\$\begingroup\$ @RadvylfPrograms Oops - didn't notice that the insertions are out of order. In that case, I would stick with the current version, since initializing with all zero values is probably going to compile to a simple
memset
and unlikely to cause much overhead anyway. If the elements are sparse enough, switching to another data structure (such asHashMap
) might also be an option. \$\endgroup\$L. F.– L. F.2022年06月29日 15:47:23 +00:00Commented Jun 29, 2022 at 15:47
Contains non-ASCII
otherwise, though I didn't test that. \$\endgroup\$