I'm trying to categorize file names of anime episodes (for now) into appropriate title based categories. The Show-titles are parsed from an XML file (I got from Anime News Network).
EDIT: The passed on backgroundWorker
object type variable is just to facilitate GUI update as this algorithm runs on a secondary thread.
DESCRIPTION: The algorithm itself is pretty straightforward, it finds matches for each keyword in the file name with a set list of show-titles and assigns a score corresponding to the show-title. After all the keywords are checked for matches, the show-title with the highest score is given as the category
to the file.
There are a few tweaks to boost the score in certain cases as well, like if the directory names match with the same show-title and if the score and the show-title length are almost the same number.
ADDITION: Class level variable declarations for context
OpenDialogView openDialog;
List<DirectoryInfo> dirList;
BackgroundWorker AnalyzerThread;
// Analyzer Components
public static char[] removablesNum = new char[] { '.', '_', '-', ' ', '^', '!', '@', '#', '$', '%', '&', '*', '~', '`', '?', '(', ')', '[', ']', '{', '}', '+', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9' };
public static char[] removables = new char[] { '.', '_', '-', ' ', '^', '!', '@', '#', '$', '%', '&', '*', '~', '`', '?', '(', ')', '[', ']', '{', '}', '+' };
public static string animeDBPath = "ANN_AnimeDB_20-12-2015.xml";
public string parentPath, outputPath;
public List<string> titles;
public List<Children> notSortedFiles;
public List<Category> sortedFiles;
CODE:
#region Analyzer
private void RunAnalysis(object backgroundWorker)
{
titles = LoadXML(animeDBPath, "item", "name");
List<DirectoryInfo> dirs = new List<DirectoryInfo>();
List<FileInfo> allFiles = new List<FileInfo>();
// Find all directories
foreach (DirectoryInfo d in dirList)
{
dirs.AddRange(d.GetDirectories("*", SearchOption.AllDirectories));
}
// Add the parent directory as well
dirs.AddRange(dirList);
// Find all the files
foreach (DirectoryInfo dir in dirs)
{
allFiles.AddRange(dir.EnumerateFiles());
}
sortedFiles = SortFiles(allFiles, backgroundWorker);
}
private List<Category> SortFiles(List<FileInfo> allFiles, object backgroundWorker)
{
List<Category> categories = new List<Category>();
int fileCount = 0;
foreach (FileInfo file in allFiles)
{
fileCount++;
string[] subStrings = file.Name.Split(removables, StringSplitOptions.RemoveEmptyEntries);
// score holds a value for each title, highest score indicates closer match
int[] score = new int[titles.Count];
bool hasAScore = false;
// list's length - 1 to avoid extensions from being checked
for (int i = 0; i < titles.Count; i++)
{
for (int j = 0; j < subStrings.Length - 1; j++)
{
// @\b defines the match to be specific to whole words
if (Regex.IsMatch(titles[i], @"\b" + subStrings[j] + @"\b", RegexOptions.IgnoreCase))
{
// If a match is found, check the directory paths to enforce the match
foreach (string s in file.Directory.Name.Split(removables, StringSplitOptions.RemoveEmptyEntries))
{
if (Regex.IsMatch(titles[i], @"\b" + s + @"\b", RegexOptions.IgnoreCase))
{
score[i]++;
}
}
score[i]++;
hasAScore = true;
// Console.WriteLine("Found match with title '{0}' with string '{1}' from file '{2}'", titles[j], subStrings[i], file.Name);
}
}
// if the percentage of word matches and total words in the title is > 80% (arbitrary)
// To avoid false matches with longer titles
// boost the score
int titleWordCount = titles[i].Split(removables, StringSplitOptions.RemoveEmptyEntries).Length;
if ((100 * (score[i]) / (2 * titleWordCount)) > 80)
{
score[i] += 2;
}
}
if (hasAScore)
{
// Find the highest score in the list and use it's title value as the title of the Category
string titleName = titles[Array.IndexOf(score, score.Max())];
bool exists = false;
// Check through all the categories if it already exists, otherwise add a new one
// TODO perhaps check this in the class's constructor
foreach (Category c in categories)
{
if (c.Name == titleName)
{
c.AddChildren(new Children(file), titleName);
exists = true;
break;
}
}
if (!exists)
{
categories.Add(new Category(new Children(file), titleName));
}
}
else
{
// Files without a score were not matched with any existing category
notSortedFiles.Add(new Children(file));
}
// Console.WriteLine("File: '{0}' has a max score of {1}", file.Name, score.Max());
// Update Progress
// Send percentComplete to the backgroundWorker and the current file number
int progressPercentage = 100 * fileCount / allFiles.Count;
// Only the ReportProgress method can update the UI
(backgroundWorker as BackgroundWorker).ReportProgress(progressPercentage, fileCount);
}
return categories;
}
private List<string> LoadXML(string filePath, string descendant, string element)
{
// Load the db
XDocument db = XDocument.Load(@filePath);
List<string> titles = new List<string>();
var query = from c in db.Root.Descendants(descendant)
// Only do anime
select c.Element("type").Value == "TV" ? c.Element(element).Value : "null";
foreach (string animeName in query)
{
titles.Add(animeName);
}
// Sanitize "null" additions
titles.RemoveAll(value => value == "null");
titles.Sort();
titles = DeAccentTitles(titles);
return titles;
}
private List<string> DeAccentTitles(List<string> titlesToClean)
{
List<string> sanitizedTitles = new List<string>();
foreach (string s in titlesToClean)
{
string nfdNormalizedString = s.Normalize(NormalizationForm.FormD);
StringBuilder builder = new StringBuilder();
foreach (char c in nfdNormalizedString)
{
if(CharUnicodeInfo.GetUnicodeCategory(c) != UnicodeCategory.NonSpacingMark)
{
builder.Append(c);
}
}
sanitizedTitles.Add(builder.ToString().Normalize(NormalizationForm.FormC));
}
return sanitizedTitles;
}
#endregion
Sample File names:
ShingekinoKyojinOVA-01(480p)[Hatsuyuki-Kaitou][D8E8CC75].mkv -- Expected Category "Shingeki no Kyojin"
(Hi10)_Gosick_-_22_The_Christmas_Carol_Adorns_the_Happiness_by_the_Window_(BD_720p)_(Broken).mkv -- Expected Category "Gosick"
Manyuu.Hiken-chou.04.HD.BD.Kira.ACB.mkv -- Expected Category "Manyu Hiken-cho"
Commie_Steins Gate 01 Prologue to the Beginning and End.mkv -- Expected Category "Steins Gate"
Commie_Steins_Gate_02_BD_720p_AnimeKens.com.mkv -- Expected Category "Steins Gate"
Extract from the Source XML File:
<report>
<args>
<type>anime</type>
<name></name>
<search></search>
</args>
<item>
<id>17938</id>
<gid>721551383</gid>
<type>ONA</type>
<name>Koyomimonogatari</name>
<precision>ONA</precision>
<vintage>2016年01月09日</vintage>
</item>
<item>
<id>17937</id>
<gid>1319318627</gid>
<type>TV</type>
<name>Qualidea Code</name>
<precision>TV</precision>
</item>
</report>
I have a related question on SO regarding the approach to this kind of categorization problem (if you need more context).
EDIT: My code runs fine but as mentioned above, produces surprising false-positives. It's also quite slow, that's why I'm looking for a code review to improve it's performance and possibly realize the cause of the false-positives.
By the way, since this is a statistical algorithm, a few wrong results doesn't mean it's not working. It's working to an acceptable standard and improving correct prediction rate is not the primary concern.
-
1\$\begingroup\$ It's unclear to me what exactly you are asking for. Can you please elaborate? \$\endgroup\$Der Kommissar– Der Kommissar2016年01月06日 17:20:38 +00:00Commented Jan 6, 2016 at 17:20
-
3\$\begingroup\$ Welcome to Code Review. We review fully working code and make it better. Does this apply to the code you posted? Are you looking for a code review? If your code is not working as you want, or you're not looking for code review, then your question is off-topic. Please read this help center page for more details. \$\endgroup\$janos– janos2016年01月06日 18:06:27 +00:00Commented Jan 6, 2016 at 18:06
-
\$\begingroup\$ My code runs fine, but is a quite slow. I was looking for ways to improve it. I've edited the question accordingly. Sorry about the confusion. \$\endgroup\$Paras– Paras2016年01月07日 05:53:56 +00:00Commented Jan 7, 2016 at 5:53
-
\$\begingroup\$ could you please give us a sample of the xml please? \$\endgroup\$Robert Snyder– Robert Snyder2016年01月07日 13:55:55 +00:00Commented Jan 7, 2016 at 13:55
-
\$\begingroup\$ I have added an extract from the XML file, otherwise the link to the XML (in the question) leads to the same file. \$\endgroup\$Paras– Paras2016年01月07日 14:20:08 +00:00Commented Jan 7, 2016 at 14:20
1 Answer 1
Don't have time to look at SortFiles
more closely right now, so just some general remarks:
I found it useful for readability to prefix private class members with
_
so make them easily distinguishable from local variables and parameters. I was stumped a few times where a particular variable came from because it was neither declared nor a parameter so I guessed that it must be a class member - having a clear visual queue help. However that's a personal style preference so YMMV.Not sure why you have prefixed
filePath
with an@
when you use it. This is only necessary when you want to name a variable the same name as a C# keyword likethis
(so you could have a local variable called@this
for example).You shouldn't use
as
like this:(backgroundWorker as BackgroundWorker).ReportProgress(progressPercentage, fileCount);
Either you're sure it's a
BackgroundWorker
then use a direct cast or you're not sure then useas
plus anull
check. The way it stands you might get aNullReferenceException
which is usually not very helpful (since technically this could be thrown in a lot of different places). With a direct cast you get at least anInvalidCastException
which tells you a lot more about what may have caused it.DeAccentTitles
should not have to deal with a collection but should just deal with an individual title - it should not concern itself with the requirement that you want to do this for a whole collection of strings. It can also be condensed with the use of LINQ:private string DeAccentTitle(string title) { var chars = s.Normalize(NormalizationForm.FormD) .Where(c => CharUnicodeInfo.GetUnicodeCategory(c) != UnicodeCategory.NonSpacingMark) .ToArray() return new string(chars).Normalize(NormalizationForm.FormC) }
Based on the above point
LoadXML
can be condensed as well. Especially thenull
handling should just disappear:private List<string> LoadXML(string filePath, string descendant, string element) { return XDocument.Load(filePath) .Root .Descendants(descendant) .Where(c => c.Element("type").Value == "TV") .Select(c.Element(element).Value) .OrderBy(v => v) .Select(DeAccentTitle) .ToList(); }
Also
RunAnalysis
can be improved. I would also make that the point where thebackgroundWorker
is cast to it's actual type.private void RunAnalysis(object backgroundWorker) { titles = LoadXML(animeDBPath, "item", "name"); var allFiles = dirList.SelectMany(d => d.GetDirectories("*", SearchOption.AllDirectories)) .SelectMany(d => d.EnumerateFiles()) .ToList(); sortedFiles = SortFiles(allFiles, (BackgroundWorker)backgroundWorker); }
Update for the SortFiles
method
A few things to improve here:
You split strings in the same way three times - I find it cleaner to extract this into a common method to perform that action.
I would extract the scoring of a single title against a specific file name into it's own method - this encapsulates the core scoring logic and then lets you deal with the scoring of all file names in a more condensed way.
After you've done 1 and 2 you can apply some LINQ magic again to make it more succinct.
I don't have the code of the
Category
class but since theCategory.Name
seems to be linked to the title name it seems weird that you'd have to pass this in for every file you add. The code would become a bit cleaner if this is tidied up - I haven't that in the below code yet though.
The refactored code for the SortFiles
looks like this:
private string[] SplitByRemovables(string value)
{
return value.Split(removables, StringSplitOptions.RemoveEmptyEntries);
}
private int ScoreTitle(string title, string[] filenameParts, string[] directoryParts)
{
var score = filenameParts.Count(p => Regex.IsMatch(title, @"\b" + p + @"\b", RegexOptions.IgnoreCase));
if (score > 0)
{
score += directoryParts.Count(p => Regex.IsMatch(title, @"\b" + p + @"\b", RegexOptions.IgnoreCase));
}
// if the percentage of word matches and total words in the title is > 80% (arbitrary)
// To avoid false matches with longer titles boost the score
int titleWordCount = SplitByRemovables(title).Length;
if ((100 * score / (2 * titleWordCount)) > 80)
{
score += 2;
}
return score;
}
private List<Category> SortFiles(List<FileInfo> allFiles, BackgroundWorker backgroundWorker)
{
List<Category> categories = new List<Category>();
int fileCount = 0;
foreach (FileInfo file in allFiles)
{
fileCount++;
var filenameParts = SplitByRemovables(Path.GetFileNameWithoutExtension(file.Name));
var directoryParts = SplitByRemovables(file.Directory.Name);
var topTitle = titles.Select(t => new { Title = t, Score = ScoreTitle(t, filenameParts, directoryParts) })
.OrderByDescending(x => x.Score)
.First();
var childFile = new Children(file);
if (topTitle.Score > 0)
{
var category = categories.FirstOrDefault(c => c.Name == topTitle.Title);
if (category == null)
{
category = new Category(childFile, topTitle.Title);
categories.Add(category);
}
else
{
category.AddChildren(childFile, topTitle.Title);
}
}
else
{
// Files without a score were not matched with any existing category
notSortedFiles.Add(childFile);
}
// Update Progress
// Send percentComplete to the backgroundWorker and the current file number
int progressPercentage = 100 * fileCount / allFiles.Count;
// Only the ReportProgress method can update the UI
backgroundWorker.ReportProgress(progressPercentage, fileCount);
}
return categories;
}
Just saw in the comments to your question that you've identified some speed issues with the regular expressions. One potential change you could make is to replace those by simple string matching. The refactored code in this case would look like this:
private string[] SplitByRemovables(string value)
{
return value.Split(removables, StringSplitOptions.RemoveEmptyEntries);
}
private int ScoreTitle(string[] titleParts, string[] filenameParts, string[] directoryParts)
{
var score = filenameParts.Intersect(titleParts).Count();
if (score > 0)
{
score += directoryParts.Intersect.(titleParts).Count();
}
// if the percentage of word matches and total words in the title is > 80% (arbitrary)
// To avoid false matches with longer titles boost the score
if ((100 * score / (2 * titleParts.Length)) > 80)
{
score += 2;
}
return score;
}
private List<Category> SortFiles(List<FileInfo> allFiles, BackgroundWorker backgroundWorker)
{
List<Category> categories = new List<Category>();
int fileCount = 0;
var splitTitles = titles.Select(t => new { Title = t, Parts = SplitByRemovables(t) }).ToList();
foreach (FileInfo file in allFiles)
{
fileCount++;
var filenameParts = SplitByRemovables(Path.GetFileNameWithoutExtension(file.Name));
var directoryParts = SplitByRemovables(file.Directory.Name);
var topTitle = splitTitles.Select(t => new { Title = t.Title, Score = ScoreTitle(t.Parts, filenameParts, directoryParts) })
.OrderByDescending(x => x.Score)
.First();
var childFile = new Children(file);
if (topTitle.Score > 0)
{
var category = categories.FirstOrDefault(c => c.Name == topTitle.Title);
if (category == null)
{
category = new Category(childFile, topTitle.Title);
categories.Add(category);
}
else
{
category.AddChildren(childFile, topTitle.Title);
}
}
else
{
// Files without a score were not matched with any existing category
notSortedFiles.Add(childFile);
}
// Update Progress
// Send percentComplete to the backgroundWorker and the current file number
int progressPercentage = 100 * fileCount / allFiles.Count;
// Only the ReportProgress method can update the UI
backgroundWorker.ReportProgress(progressPercentage, fileCount);
}
return categories;
}
-
\$\begingroup\$ Thanks for the great advice! I'm still new to LINQ so am not familiar with 'Best Practices'. And I thought the
@
prefix infilepath
is to avoid accidental C# keyword creations if a filename containedthis
etc. I could be wrong about this assumption though. \$\endgroup\$Paras– Paras2016年01月16日 17:11:38 +00:00Commented Jan 16, 2016 at 17:11 -
\$\begingroup\$ I have added class level variables declarations for more context. Should I paste the whole file instead? It has a few front-end related event handlers (~300 lines). \$\endgroup\$Paras– Paras2016年01月16日 18:47:13 +00:00Commented Jan 16, 2016 at 18:47
-
1\$\begingroup\$ @ParasDPain: You could post a follow up question with the complete code incorporating all the suggestions you deem useful from this one. \$\endgroup\$ChrisWue– ChrisWue2016年01月16日 21:26:02 +00:00Commented Jan 16, 2016 at 21:26
-
1\$\begingroup\$ @ParasDPain: I've updated my answer with the remaining code reviewed. \$\endgroup\$ChrisWue– ChrisWue2016年01月17日 19:33:49 +00:00Commented Jan 17, 2016 at 19:33
-
\$\begingroup\$ Thanks for the details! I will post a follow-up question after implementing these suggestions. Cheers \$\endgroup\$Paras– Paras2016年01月18日 04:23:47 +00:00Commented Jan 18, 2016 at 4:23