It's been about a day since I've started writing some Rust, and I'm looking for some code review on a simple spell checker I've written up
use generator::{Gn, Generator, Scope, done};
use std::collections::{HashMap, HashSet};
use std::ops::Add;
use std::io::{BufRead, BufReader};
use std::fs::File;
#[derive(Debug)]
struct EditWord {
word: String,
editDistance: usize,
}
impl EditWord {
fn new(w: String, editDistance: usize) -> EditWord {
return EditWord { word: w, editDistance };
}
}
static ASCII_LOWER: [char; 26] = [
'a', 'b', 'c', 'd', 'e',
'f', 'g', 'h', 'i', 'j',
'k', 'l', 'm', 'n', 'o',
'p', 'q', 'r', 's', 't',
'u', 'v', 'w', 'x', 'y',
'z',
];
type Stream<'s, T> = Generator<'s, (), T>;
#[derive(Debug)]
pub struct WordDataSet {
counter: HashMap<String, usize>
}
impl<'a> From<Vec<&'a str>> for WordDataSet {
fn from(vec: Vec<&'a str>) -> Self {
let mut counter = HashMap::new();
for w in vec {
*counter.entry(w.to_string()).or_default() += 1;
}
return WordDataSet { counter };
}
}
impl WordDataSet {
pub fn prob(&self, word: &str) -> f64 {
if !self.counter.contains_key(word) {
return 0.0;
}
return *self.counter.get(word).unwrap() as f64 / self.counter.values().sum::<usize>() as f64;
}
fn exists(&self, word: &str) -> bool {
return self.counter.contains_key(word);
}
}
fn splits(w: &str) -> Vec<(&str, &str)> {
(0..=w.len()).map(|i| w.split_at(i)).collect()
}
pub struct SimpleCorrector {
data_set: WordDataSet
}
impl SimpleCorrector {
pub fn correct(&self, word: &str) -> Option<String> {
if self.data_set.exists(word) {
return Some(word.to_string());
}
edits(1, word)
.filter(|e| self.data_set.exists(&e.word))
.map(|e| ((1 / e.editDistance) as f64 * self.data_set.prob(&e.word), e.word))
.max_by(|(p1, w1), (p2, w2)| p1.partial_cmp(p2).expect("Tried to compare NAN"))
.map(|(p, w)| w)
}
}
fn edit1(w: &str) -> Stream<String> {
let pairs = splits(w);
let g = Gn::new_scoped(move |mut s| {
//deletes
for (a, b) in pairs.iter() {
let delete = format!("{}{}", a, b.get(1..).unwrap_or_default());
s.yield_(delete);
}
for (a, b) in pairs.iter() {
for c in ASCII_LOWER.iter() {
//replace
let replace = format!("{}{}{}", a, c, b.get(1..).unwrap_or_default());
s.yield_(replace);
//insert
let insert = format!("{}{}{}", a, c, b);
s.yield_(insert);
}
}
done!();
});
return g;
}
fn edits(n: usize, word: &str) -> Stream<EditWord> {
let g = Gn::new_scoped(move |mut s| {
let mut v = vec![word.to_string()];
let mut seen = HashSet::new();
seen.insert(word.to_string());
for i in 0..n {
let mut next_list = vec![];
for word in v {
for w in edit1(&word) {
if !seen.contains(&w) {
next_list.push(w.to_string());
seen.insert(w.to_string());
let editWord = EditWord::new(w.to_string(), i + 1);
s.yield_(editWord);
}
}
}
v = next_list;
}
done!();
});
return g;
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_word_prob() {
let data_set = WordDataSet::from(vec!["A", "B"]);
assert_eq!(data_set.prob("B"), 0.5)
}
#[test]
fn test_word_split() {
let word = "abc";
let word_splits = splits(word);
assert_eq!(word_splits, vec![("", "abc"),
("a", "bc"),
("ab", "c"),
("abc", "")])
}
#[test]
fn test_corrector_on_valid_word() {
let word = "ab";
let word_list = vec!["ab", "cd"];
let word_dataset = WordDataSet::from(word_list);
let s = SimpleCorrector { data_set: word_dataset };
let res = s.correct("ab");
dbg!(res);
}
#[test]
fn test_corrector_on_invalid_word() {
let test_word = "aa";
let word_list = vec!["ab", "cd"];
let word_dataset = WordDataSet::from(word_list);
let s = SimpleCorrector { data_set: word_dataset };
let res = s.correct(test_word);
assert_eq!(res.unwrap(), "ab");
}
}
Deps:
[dependencies]
generator = "0.6.19"
I've got to add more tests to test the logic again, but so far I think it's alright. I'm mainly looking for pointers on how I can exploit the type system better, and how to write shorter, ergonomic code.
I'm quite sure the .to_string()
calls in the edits
function can be reduced, but not sure how.
-
\$\begingroup\$ @Shepmaster, whoops that was just an extraneous artifiact from before -- removed it \$\endgroup\$nz_21– nz_212020年01月08日 15:48:23 +00:00Commented Jan 8, 2020 at 15:48
1 Answer 1
First thoughts
Read and act on the warnings provided by the compiler. That's one of the biggest points of a compiled language. Do not ignore the 10+ warnings. You have a ton of unused imports, unused variables, and non-idiomatic variable names.
Run Rustfmt, a tool for automatically formatting Rust code to the community-accepted style.
Run Clippy, a tool for finding common mistakes that may not be compilation errors but are unlikely to be what the programmer intended. This points out that almost all of your uses of
return
are non-idiomatic.
impl EditWord
- Don't abbreviate the argument
w
; call itword
and use the struct shorthand syntax.
impl<'a> From<Vec<&'a str>> for WordDataSet
There's no obvious reason to require a
Vec
. This could equally beFromIterator
.The required conversion to
String
is unfortunate. How do I use the Entry API with an expensive key that is only constructed if the Entry is Vacant? talks about a future solution. Also, you could avoid allocation completely here by storing the string slices instead.
impl WordDataSet
Don't look up in a map twice. Instead, use
get
once and work with theOption
.Using
as
is dubious for integer <-> floating point conversions. It's better to useTryFrom
and panic instead. See How do I convert between numeric types safely and idiomatically? for more details.
fn splits
- This doesn't need to return a
Vec
as you only iterate over it. You can return an iterator instead. In this case, you also need to make itClone
so that you can use it twice inedit1
, but you could probably iterate just once or callsplits
twice, as it's cheap. See What is the correct way to return an Iterator (or any other trait)?.
impl SimpleCorrector
- Instead of always returning a
String
, you could return aCow<str>
. This would avoid allocation when the word is correctly spelled.
fn edit1
- It's more common to use
&collection
thancollection.iter()
.
fn test_corrector_on_valid_word
- This has no assertions.
fn test_corrector_on_invalid_word
- I avoid using
unwrap
whenever possible. It's fine in tests, but I still try.
General
From an outside API, it's not clear what benefit there is to exposing both
WordDataSet
andSimpleCorrector
. Perhaps the API should be restricted to one.I'm not excited about the
generator
crate. This may just be ignorance on my part, but I'd probably strive to use standard iterator techniques until Rust gains stable first-class support for generators
use generator::{done, Generator, Gn};
use std::{
collections::{HashMap, HashSet},
iter::FromIterator,
};
#[derive(Debug)]
struct EditWord {
word: String,
edit_distance: usize,
}
impl EditWord {
fn new(word: String, edit_distance: usize) -> EditWord {
EditWord {
word,
edit_distance,
}
}
}
static ASCII_LOWER: [char; 26] = [
'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's',
't', 'u', 'v', 'w', 'x', 'y', 'z',
];
type Stream<'s, T> = Generator<'s, (), T>;
#[derive(Debug)]
pub struct WordDataSet {
counter: HashMap<String, usize>,
}
impl<'a> FromIterator<&'a str> for WordDataSet {
fn from_iter<I>(words: I) -> Self
where
I: IntoIterator<Item = &'a str>,
{
let mut counter = HashMap::new();
for w in words {
*counter.entry(w.to_string()).or_default() += 1;
}
WordDataSet { counter }
}
}
impl WordDataSet {
pub fn prob(&self, word: &str) -> f64 {
self.counter.get(word).map_or(0.0, |&c| {
let sum: usize = self.counter.values().sum();
c as f64 / sum as f64
})
}
fn exists(&self, word: &str) -> bool {
self.counter.contains_key(word)
}
}
fn splits(w: &str) -> impl Iterator<Item = (&str, &str)> + Clone {
(0..=w.len()).map(move |i| w.split_at(i))
}
pub struct SimpleCorrector {
data_set: WordDataSet,
}
impl SimpleCorrector {
pub fn correct(&self, word: &str) -> Option<String> {
if self.data_set.exists(word) {
return Some(word.to_string());
}
edits(1, word)
.filter(|e| self.data_set.exists(&e.word))
.map(|e| {
(
(1 / e.edit_distance) as f64 * self.data_set.prob(&e.word),
e.word,
)
})
.max_by(|(p1, _w1), (p2, _w2)| p1.partial_cmp(p2).expect("Tried to compare NAN"))
.map(|(_p, w)| w)
}
}
fn edit1(w: &str) -> Stream<String> {
let pairs = splits(w);
Gn::new_scoped(move |mut s| {
//deletes
for (a, b) in pairs.clone() {
let delete = format!("{}{}", a, b.get(1..).unwrap_or_default());
s.yield_(delete);
}
for (a, b) in pairs {
for c in &ASCII_LOWER {
//replace
let replace = format!("{}{}{}", a, c, b.get(1..).unwrap_or_default());
s.yield_(replace);
//insert
let insert = format!("{}{}{}", a, c, b);
s.yield_(insert);
}
}
done!();
})
}
fn edits(n: usize, word: &str) -> Stream<EditWord> {
Gn::new_scoped(move |mut s| {
let mut v = vec![word.to_string()];
let mut seen = HashSet::new();
seen.insert(word.to_string());
for i in 0..n {
let mut next_list = vec![];
for word in v {
for w in edit1(&word) {
if !seen.contains(&w) {
next_list.push(w.to_string());
seen.insert(w.to_string());
let edit_word = EditWord::new(w.to_string(), i + 1);
s.yield_(edit_word);
}
}
}
v = next_list;
}
done!();
})
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_word_prob() {
let data_set = WordDataSet::from_iter(vec!["A", "B"]);
assert_eq!(data_set.prob("B"), 0.5)
}
#[test]
fn test_word_split() {
let word = "abc";
let word_splits = splits(word).collect::<Vec<_>>();
assert_eq!(
word_splits,
vec![("", "abc"), ("a", "bc"), ("ab", "c"), ("abc", "")]
)
}
#[test]
fn test_corrector_on_valid_word() {
let word_list = vec!["ab", "cd"];
let data_set = WordDataSet::from_iter(word_list);
let s = SimpleCorrector { data_set };
let res = s.correct("ab");
dbg!(res);
}
#[test]
fn test_corrector_on_invalid_word() {
let test_word = "aa";
let word_list = vec!["ab", "cd"];
let data_set = WordDataSet::from_iter(word_list);
let s = SimpleCorrector { data_set };
let res = s.correct(test_word);
assert_eq!(res.as_deref(), Some("ab"));
}
}