One of many aspects of a project I am responsible for is generating an abbreviation, that is no more than n
characters, for a given string. We use this in the organization for a couple things, and it only happens once for each "thing", so performance isn't hyper-critical.
An example is something like "The Main Test Company" — when asking for a 3-character abbreviation, I want to be able to generate a string like MTC
. Now this would also be the same string as My Tiny Cat
, but the idea is that a given company can be abbreviated and it can be done safely and effectively.
Also, if the abbreviations aren't large enough for the requested characters (say we request 4 characters from My Awesome Company
) it should, for each abbreviation character, start adding characters to the total (so our example is MYAC
).
I wrote the whole thing in F#, and it seems pretty readable and followable, I'm also including some test strings to verify functionality. Any comments are welcome.
let genAbbr ignoreList padding charCount (str : string) =
if str.Length <= charCount then
match padding with
| Some c -> str |> Seq.append (String(c, charCount - str.Length)) |> Seq.toArray
| None -> str.ToCharArray()
|> Array.map Char.ToUpper
|> String
else
let words =
[|' '|]
|> Array.append (str.ToCharArray())
|> Array.fold (fun (acc, cs) c ->
match c |> Char.IsUpper, c = ' ', cs with
| true, _, [] -> (acc, [c])
| true, _, cs -> (cs::acc, [c])
| _, true, [] -> (acc, [])
| _, true, cs -> (cs::acc, [])
| _, _, cs -> (acc, c::cs)) ([], [])
|> fst
|> Array.ofList
|> Array.map (Array.ofList >> Array.rev >> String)
|> Array.rev
|> Array.map (fun s -> s.ToCharArray())
let ignoreWords = words |> Array.filter (fun w -> ignoreList |> Array.contains ((w |> String).ToLower()) |> not)
let abbrCs =
if ignoreWords |> Array.length >= charCount then ignoreWords |> Array.map Seq.head
else if words |> Array.length >= charCount then words |> Array.map Seq.head
else
let firstCaps = words |> Array.map (fun s -> if s.Length > charCount then s.[0..charCount] else s) |> Array.length
printfn "%A %A %A" ignoreWords words firstCaps
words
|> Array.fold (fun (acc, r) w ->
if w.Length > r then (w.[0..(r - 1)]::acc, 1)
else (w::acc, r - (w.Length - 1))) ([], charCount - firstCaps + 1)
|> fst
|> Array.ofList
|> Array.rev
|> Array.concat
|> Array.map Char.ToUpper
abbrCs |> Array.take charCount |> String
And lastly, some POC's:
["The Main Test Company"; "Main Test Company"; "The Main Company"; "The Company"; "Company"; "MainCompany"; "The MainCompany"; "MainTestCompany"; "SomeRX"; "SomeCompanyT"]
|> List.map (genAbbr [|"the"|] None 3)
// Should be ["MTC"; "MTC"; "TMC"; "THC"; "COM"; "MAC"; "TMC"; "MTC"; "SRX"; "SCT"]
Thanks for any commentary, and I look forward to this being brutally decimated. :)
-
2\$\begingroup\$ I took a nap, so my brain is less fried, but still a little fried. I keep trying to follow this code, but all I keep thinking is "Extract functions. Name things." \$\endgroup\$RubberDuck– RubberDuck2018年01月10日 01:01:52 +00:00Commented Jan 10, 2018 at 1:01
2 Answers 2
Some possible improvement points:
- Splitting out functions for word grouping, padding, and word filtering
- Removing unnecessary branching
- Moving from
fold
to self-contained recursive functions that require less post processing - Use a
HashSet
with aStringComparer
for filtering
Using these techniques, and avoiding regexes (like my previous answer), we can get a solution that may be easier to reason about:
open System.Collections.Generic
let getWords (str:string) =
let word rev_chars =
(rev_chars |> List.rev |> Array.ofList |> String).ToUpper()
let add_word rev_chars acc =
if rev_chars <> [] then (word rev_chars) :: acc else acc
let rec loop (idx:int) (rev_chars:char list) (acc: string list) prev_lower =
if idx < str.Length then
let c = str.[idx]
let rev_chars, acc =
match str.[idx] with
| ' ' -> [] , (add_word rev_chars acc)
| c when Char.IsUpper(c) -> [c] , (add_word rev_chars acc)
| c -> (c::rev_chars) , acc
loop (idx + 1) rev_chars acc (Char.IsLower c)
else
let acc = add_word rev_chars acc in List.rev acc
loop 0 [] [] false
let pad (str:string) len padchar =
if str.Length < len then str + String(padchar, (len - str.Length)) else str
// Filter a list of words, if we can 'afford' it, starting from the back
let filterWords (ignoreList:HashSet<string>) charCount words =
let rec loop revWords acc =
match revWords with
| [] -> acc
| word::revWords when ignoreList.Contains(word) ->
let getLen = List.fold (fun len s -> len + String.length s) 0
let maxLenWithout = (getLen words) + (getLen acc)
let numOtherParts = List.length revWords + List.length acc
if maxLenWithout >= charCount && numOtherParts >= charCount then loop revWords acc
else loop revWords (word::acc)
| word::revWords -> loop revWords (word::acc)
loop (List.rev words) []
let genAbbr (ignoreList:HashSet<string>) padding charCount (str : string) =
let words = getWords str |> filterWords ignoreList charCount
// Given a list of words, generate a sequence of abbreviations
let rec abbrs before (words: string list) : string seq = seq {
match words with
| [] -> ()
| word::words ->
for i = 1 to word.Length do
let before = before + word.Substring(0, i)
yield before + (words |> List.map (fun w -> w.Substring(0, 1)) |> String.concat "")
yield! abbrs (before + word) words }
(abbrs "" words)
|> Seq.takeWhile (fun s -> s.Length <= charCount)
|> Seq.maxBy (fun s -> s.Length)
|> fun s ->
match padding with | None -> s | Some padchar -> pad s charCount padchar
To call it:
let ignored = let h = HashSet<string>(StringComparer.OrdinalIgnoreCase) in h.Add("the") |> ignore; h
["The Main Test Company"; "Main Test Company"; "The Main Company"; "The Company"; "Company"; "MainCompany"; "The MainCompany"; "MainTestCompany"; "SomeRX"; "SomeCompanyT"]
|> List.map (fun w -> w, (genAbbr ignored None 3 w))
Some possibilities:
- Use
HashSet
with a case insensitiveStringComparer
for the ignore list to reduce case sensitive code(e.g., Contains(s.ToLower())
- Use
Regex.Split()
for concision - Use lazy sequences to generate stream of answers to consider
- Separate some of the steps, so it is easier to reason about
- Look for possibilities to use something other than
FSharp.Core.List
, because having to prepend and then reverse is often accidental rather than essential complexity - Split out the padding thing to a separate function, since it seems unrelated to abbreviating (not done in code below)
For your consideration:
open System.Text.RegularExpressions
open System.Collections.Generic
let genAbbr (ignoreList:HashSet<string>) padding charCount (str : string) =
if str.Length <= charCount then
str.ToUpper() +
match padding with
| Some c -> String(Char.ToUpper(c), charCount - str.Length)
| None -> ""
else
let words = Regex.Split(str, @"(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|\s+")
let filtered = words |> Array.filter (fun w -> not (ignoreList.Contains(w)))
if filtered.Length >= charCount then
filtered |> Array.map (Seq.head >> Char.ToUpper)|> Array.take charCount |> String
else if words.Length >= charCount then
words |> Array.map Seq.head |> Array.take charCount |> String
else
// Expand words with all capital letters
let words =
[ for w in words do
if Seq.forall (Char.IsUpper) w then
for c in w do yield String(c, 1)
else yield w.ToUpper() ]
// Given a list of words, generate a sequence of abbreviations
let rec abbrs before (words: string list) : string seq = seq {
match words with
| [] -> ()
| word::words ->
for i = 1 to word.Length do
let before = before + word.Substring(0, i)
yield before + (words |> List.map (fun w -> w.Substring(0, 1)) |> String.concat "")
yield! abbrs (before + word) words }
(abbrs "" words)
|> Seq.takeWhile (fun s -> s.Length <= charCount)
|> Seq.maxBy (fun s -> s.Length)
let ignored = let h = HashSet<string>(StringComparer.OrdinalIgnoreCase) in h.Add("the") |> ignore; h
["The Main Test Company"; "Main Test Company"; "The Main Company"; "The Company"; "Company"; "MainCompany"; "The MainCompany"; "MainTestCompany"; "SomeRX"; "SomeCompanyT"]
|> List.map (fun w -> w, (genAbbr ignored None 3 w))
|> fun xs -> xs.Dump()
-
\$\begingroup\$ The only thing about this I don't like is the Regex, but the version I had used code that was just as complex so I suppose I'll live. \$\endgroup\$Der Kommissar– Der Kommissar2018年01月11日 01:46:02 +00:00Commented Jan 11, 2018 at 1:46