3
\$\begingroup\$

I'm working on a project where I need to map XML values to field names. It's a classic CS problem, and I think I've got a pretty good approach, but I would like feedback.

For example: breachReportType matches to Breach Report Type:. However, there are similar matches like: breachType or breachReportCause.

The inputs are like ...

  • Xml by Field Name: <xml><breachReportType>
  • Field Name by ID: [ { Key: '1234', Value: 'Breach Report Type' } ]
  • Content by Field ID: <xml><Field id='1234' Value='Email'>

    // this.MapValues //->
    // Fields Example: [{ Key: '12345', Value: 'Breach Report Type' }]
    // XML Example: '<breachReportType />' 
    /// <summary>
    /// Builds a map from field id to XML element name by pairing every element
    /// in <paramref name="xml"/> with the field whose label matches it best.
    /// </summary>
    /// <param name="fields">Field labels keyed by field id.</param>
    /// <param name="xml">XML text whose element names are matched against the labels.</param>
    /// <returns>
    /// A dictionary keyed by field id whose value is the matched element name.
    /// When several elements pick the same field id, the first one in document order is kept.
    /// </returns>
    private Dictionary<string, string> MapValues(Dictionary<string, string> fields, string xml)
    {
        var map = new Dictionary<string, string>();

        foreach (XElement elm in XDocument.Parse(xml).Descendants())
        {
            string name = elm.Name.ToString();
            string formattedName = this.SplitCamelCase(name);

            string bestMatchId = null;
            int bestScore = 0;
            // Characters of the best label that were NOT accounted for by the score.
            int bestLeftover = 0;

            foreach (var field in fields)
            {
                // An empty label cannot be stripped or matched; skip it instead of throwing.
                if (string.IsNullOrEmpty(field.Value))
                {
                    continue;
                }

                // Drop the last character of the label
                // (presumably a trailing ':' as in "Breach Report Type:" -- TODO confirm).
                string strippedFieldValue = field.Value.Substring(0, field.Value.Length - 1);
                int score = this.MatchStrings(strippedFieldValue, formattedName);

                // Only a strictly higher score can displace the current best.
                if (score <= bestScore)
                {
                    continue;
                }

                // Both sides of the tie-break use the stripped label, so the
                // "unmatched characters" figures are directly comparable.
                int leftover = strippedFieldValue.Length - score;
                if (bestMatchId == null || leftover < bestLeftover)
                {
                    bestMatchId = field.Key;
                    bestScore = score;
                    bestLeftover = leftover;
                }
            }

            if (bestMatchId != null && !map.ContainsKey(bestMatchId))
            {
                map.Add(bestMatchId, name);
            }
        }

        return map;
    }
    // MatchStrings //->
    // firstString Example: 'Breach Report Type' (field label)
    // secondString Example: 'breach Report Type' (element name after SplitCamelCase)
    /// <summary>
    /// Scores how well the words of <paramref name="secondString"/> are contained
    /// in the words of <paramref name="firstString"/>, ignoring case.
    /// </summary>
    /// <param name="firstString">Space-separated words that are searched in.</param>
    /// <param name="secondString">Space-separated words that are searched for.</param>
    /// <returns>
    /// The sum of the lengths of every word of <paramref name="secondString"/> for each
    /// word of <paramref name="firstString"/> that contains it; 0 when nothing matches.
    /// </returns>
    private int MatchStrings(string firstString, string secondString)
    {
        string[] separators = { " " };
        string[] firstWords = firstString.Split(separators, StringSplitOptions.RemoveEmptyEntries);
        string[] secondWords = secondString.Split(separators, StringSplitOptions.RemoveEmptyEntries);

        int match = 0;
        foreach (string container in firstWords)
        {
            foreach (string candidate in secondWords)
            {
                // Ordinal, case-insensitive containment: the result does not depend on the
                // current culture and no lower-cased copies are allocated per comparison.
                if (container.IndexOf(candidate, StringComparison.OrdinalIgnoreCase) >= 0)
                {
                    match += candidate.Length;
                }
            }
        }

        return match;
    }
    // TranslateAndConvert //->
    // Record Example: "<xml><Field id="1234" Value="Email">"
    // FieldMap Example: [{ Key: '1234', Value: 'breachReportType' }]
    /// <summary>
    /// Rewrites the Field elements of a record as elements named after their mapped
    /// field names, collected under a single "topmostSubform" root.
    /// </summary>
    /// <param name="record">
    /// XML text whose root element's text content is itself XML containing the Field elements.
    /// </param>
    /// <param name="fieldMap">Element names keyed by field id.</param>
    /// <returns>The generated document serialized with <c>ToString()</c>.</returns>
    private string TranslateAndConvert(string record, Dictionary<string, string> fieldMap)
    {
        XElement root = new XElement("topmostSubform");
        XDocument output = new XDocument(root);

        // The outer document only wraps the real payload as text, so parse twice.
        string innerXml = XDocument.Parse(record).Root.Value;
        List<XElement> fieldNodes = XDocument.Parse(innerXml).Descendants("Field").ToList();

        foreach (XElement fieldNode in fieldNodes)
        {
            // Attribute names are looked up exactly as "id" and "value" (XML names are case-sensitive).
            XAttribute idAttribute = fieldNode.Attribute("id");
            XAttribute valueAttribute = fieldNode.Attribute("value");
            if (idAttribute == null || valueAttribute == null)
            {
                continue;
            }

            // Fields without a mapping (or mapped to null) are left out of the result.
            string mappedName;
            if (!fieldMap.TryGetValue(idAttribute.Value, out mappedName) || mappedName == null)
            {
                continue;
            }

            root.Add(new XElement(mappedName, valueAttribute.Value));
        }

        return output.ToString();
    }
    // SplitCamelCase //->
    // input example: 'breachReportType'
    /// <summary>
    /// Inserts a space before every ASCII upper-case letter and trims the result,
    /// e.g. "breachReportType" becomes "breach Report Type".
    /// </summary>
    /// <param name="input">The camel-case or Pascal-case text to split.</param>
    /// <returns>The input with its words separated by single spaces.</returns>
    private string SplitCamelCase(string input)
    {
        // "$1" re-emits the captured capital letter after the inserted space.
        return System.Text.RegularExpressions.Regex.Replace(
            input,
            "([A-Z])",
            " $1",
            System.Text.RegularExpressions.RegexOptions.Compiled).Trim();
    }
    

Any better ideas for an approach?

asked Jul 26, 2013 at 14:13
\$\endgroup\$
0

1 Answer 1

1
\$\begingroup\$

You could possibly improve it by using a fuzzy match algorithm such as the Levenshtein Distance Algorithm for your MatchStrings method. Then you could simply look for the smallest return value of all of the comparisons and use that as the best without having to do the extra length comparisons. This would also work better if there are some spelling variations in the data.

answered Jul 27, 2013 at 2:44
\$\endgroup\$
1
  • \$\begingroup\$ Ya, I really like that. Thanks for the suggestion! \$\endgroup\$ Commented Jul 29, 2013 at 13:19

Your Answer

Draft saved
Draft discarded

Sign up or log in

Sign up using Google
Sign up using Email and Password

Post as a guest

Required, but never shown

Post as a guest

Required, but never shown

By clicking "Post Your Answer", you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.