8
\$\begingroup\$

Got bored writing a review on an HTML parser and decided I wanted to try.

So I threw this together to see I could parse an Amazon page.

curl -A 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X; en-US; rv:1.8.1.13) Gecko/20080313 Firefox' https://www.amazon.com | parser

Look what I found on amazons home page:

Comment: _
 .__(.)< (MEOW)
 \___)
 ~~~~~~~~~~~~~~~~~~

Looks like a duck cat!!!

Note: This is not designed to parse valid HTML. The idea was to parse invalid HTML that is found on the web. So it makes allowances for a couple of common problems found in HTML that you see on the web.

It also assumes (incorrectly) that all text between <script> => </script> and <style> => </style> is one big blob of text.

parser.h

#ifndef THORSANVIL_HTMLPARSER_PARSER_H
#define THORSANVIL_HTMLPARSER_PARSER_H
#include <string>
#include <map>
#include <istream>
namespace ThorsAnvil
{
 namespace HTMLParser
 {
using Attributes = std::map<std::string, std::string>;
class HTMLTokenI
{
 public:
 virtual ~HTMLTokenI() {}
 // By default the functions deliberately do nothing.
 virtual void DocType(std::string const& docString) {}
 virtual void tagOpen(std::string const& tagName, Attributes const& attr) {}
 virtual void tagOpenClose(std::string const& tagName, Attributes const& attr) {}
 virtual void tagClose(std::string const& tagName) {}
 virtual void comment(std::string const& comment) {}
 virtual void text(std::string const& text) {}
 virtual void error(std::string const& message) {}
};
class HTMLSaxParser
{
 std::istream& htmlpage;
 HTMLTokenI& callback;
 public:
 HTMLSaxParser(std::istream& htmlpage, HTMLTokenI& callback)
 : htmlpage(htmlpage)
 , callback(callback)
 {}
 void parse();
 private:
 void parseDocType();
 void parseTag();
 void parseComment();
 void parseTagClose();
 void parseTagOpen();
 bool attributesFinished;
 Attributes readAttributes();
 bool getAttribute(std::istream& s, std::string& attr);
 void getNonHtmlText(std::string const& tag);
};
 }
}
#endif

parser.cpp

#include "parser.h"
#include <cctype>
#include <vector>
using namespace ThorsAnvil::HTMLParser;
void HTMLSaxParser::parse()
{
 parseDocType();
 std::string text;
 while(std::getline(htmlpage, text, '<'))
 {
 if (!text.empty())
 {
 callback.text(text);
 }
 if (htmlpage.good())
 {
 parseTag();
 }
 }
}
void HTMLSaxParser::parseDocType()
{
 char firstChar;
 while(htmlpage.get(firstChar) && std::isspace(firstChar))
 {
 // ignore space
 }
 if (!htmlpage)
 {
 callback.error("Empty Page");
 return;
 }
 if (firstChar == '<')
 {
 char secondChar = htmlpage.get();
 if (!htmlpage)
 {
 callback.error("Bad page only contains '<'");
 return;
 }
 if (secondChar == '!')
 {
 std::string docType;
 std::getline(htmlpage, docType, '>');
 callback.DocType(docType);
 }
 else
 {
 htmlpage.unget();
 htmlpage.unget();
 }
 }
 else
 {
 htmlpage.unget();
 }
}
void HTMLSaxParser::parseTag()
{
 // Note this function is called after the␣
 // initial '<' has been removed from the stream.
 char nextChar;
 while (htmlpage.get(nextChar) && std::isspace(nextChar))
 {
 // common error is to place leading space inside a tag.
 // let us ignore it to be good citizens. This is not valid
 // in html but a lot of pages on the web have this issue.
 callback.error("Leading Space in Tag");
 }
 if (htmlpage)
 {
 if (nextChar == '!')
 {
 parseComment();
 }
 else if (nextChar == '/')
 {
 parseTagClose();
 }
 else
 {
 htmlpage.unget();
 parseTagOpen();
 }
 }
}
void HTMLSaxParser::parseComment()
{
 std::string comment;
 char nextChar1 = '0円';
 char nextChar2 = '0円';
 if (htmlpage.get(nextChar1) && nextChar1 == '-' && htmlpage.get(nextChar2) && nextChar2 == '-')
 {
 std::string commentPart;
 while(std::getline(htmlpage, commentPart, '>'))
 {
 if (commentPart.size() >= 2 && commentPart[commentPart.size() - 2] == '-' && commentPart[commentPart.size() - 1] == '-')
 {
 comment += commentPart.substr(0, commentPart.size() - 2);
 break;
 }
 comment += commentPart;
 comment += '>';
 }
 }
 else
 {
 if (nextChar1 != '-')
 {
 htmlpage.unget();
 }
 htmlpage.unget();
 std::getline(htmlpage, comment, '>');
 callback.error("Badly formed Comment");
 }
 callback.comment(comment);
}
void HTMLSaxParser::parseTagClose()
{
 std::string tag;
 std::getline(htmlpage, tag, '>');
 auto find = std::find_if(std::begin(tag), std::end(tag), [](char x){return std::isspace(x);});
 if (find != std::end(tag))
 {
 callback.error("Badly formed close");
 tag = tag.substr(0, std::distance(std::begin(tag), find));
 }
 callback.tagClose(tag);
}
void HTMLSaxParser::parseTagOpen()
{
 std::string tag;
 char nextCharacter;
 while(htmlpage.get(nextCharacter) && nextCharacter != '>' && nextCharacter != '/' && !std::isspace(nextCharacter))
 {
 tag += std::tolower(nextCharacter);
 }
 Attributes attributes;
 bool openClose = false;
 if (nextCharacter != '>' && nextCharacter != '/')
 {
 attributes = readAttributes();
 }
 if (nextCharacter == '>')
 {
 // Normal Valid Tag
 }
 else if (nextCharacter == '/')
 {
 // HTML5 or XHTML Open/Close tag
 openClose = true;
 if (!htmlpage.get(nextCharacter))
 {
 callback.error(std::string("Badly Formed Tag. Close File in tag: ") + tag);
 }
 else if (nextCharacter != '>')
 {
 callback.error(std::string("Badly Formed Tag. Expected '>' after '/' got: ") + nextCharacter + " in: " + tag);
 std::string ignore;
 std::getline(htmlpage, ignore, '>');
 }
 }
 static std::vector<std::string> selfClosingTags = {"area", "base", "br", "col", "embed", "hr", "img", "input", "keygen", "link", "meta", "param", "source", "track", "wbr"};
 openClose = openClose || std::find(std::begin(selfClosingTags), std::end(selfClosingTags), tag) != std::end(selfClosingTags);
 if (openClose)
 {
 callback.tagOpenClose(tag, attributes);
 }
 else
 {
 callback.tagOpen(tag, attributes);
 if (tag == "script" || tag == "style")
 {
 // The content inside these tags is not HTML
 // So search for the closing tag now rather
 // than use the generic routines.
 getNonHtmlText(tag);
 }
 }
}
void HTMLSaxParser::getNonHtmlText(std::string const& tag)
{
 std::string program;
 std::string part;
 while(std::getline(htmlpage, part, '<'))
 {
 char nextChar;
 if (htmlpage.get(nextChar) && nextChar == '/')
 {
 bool fail = false;
 int loop;
 for(loop = 0; loop < tag.size(); ++loop)
 {
 if (!htmlpage.get(nextChar) || nextChar != tag[loop])
 {
 fail = true;
 break;
 }
 }
 for(;loop >= 0; --loop)
 {
 htmlpage.unget();
 }
 if (!fail)
 {
 htmlpage.unget();
 htmlpage.unget();
 break;
 }
 }
 program += part;
 program += '<';
 program += nextChar;
 }
 program += part;
 callback.text(program);
}
bool HTMLSaxParser::getAttribute(std::istream& s, std::string& attr)
{
 attr.clear();
 if (attributesFinished)
 {
 return false;
 }
 bool result = false;
 char nextChar;
 while(s.get(nextChar) && std::isspace(nextChar))
 {
 // ignore space
 }
 s.unget();
 while(s.get(nextChar) && nextChar != '>' && nextChar != '/' && !std::isspace(nextChar))
 {
 result = true;
 attr += nextChar;
 if (nextChar == '"' || nextChar == '\'')
 {
 char quote = nextChar;
 while(s.get(nextChar) && nextChar != quote)
 {
 attr += nextChar;
 }
 attr += nextChar;
 }
 }
 if (nextChar == '>' || nextChar == '/')
 {
 attributesFinished = true;
 s.unget();
 }
 return result;
}
Attributes HTMLSaxParser::readAttributes()
{
 Attributes result;
 attributesFinished = false;
 std::string attribute;
 while(getAttribute(htmlpage, attribute))
 {
 std::string key;
 std::string value;
 auto equalSign = attribute.find('=');
 if (equalSign == std::string::npos)
 {
 callback.error(std::string("Badly Defined Attribute: ") + attribute);
 key = attribute;
 }
 else
 {
 key = attribute.substr(0, equalSign);
 value = attribute.substr(equalSign + 1);
 if (value[0] == '"' || value[0] == '\'')
 {
 if (value[value.size() - 1] == value[0])
 {
 value = value.substr(1, value.size() - 2);
 }
 else
 {
 callback.error(std::string("Badly quotes Attribute Value: Key=") + key + " value=" + value );
 }
 }
 }
 std::transform(std::begin(key), std::end(key), std::begin(key), [](char x){return std::tolower(x);});
 result[key] = value;
 }
 return result;
}

main.cpp

#include "parser.h"
#include <iostream>
#include <fstream>
namespace tap = ThorsAnvil::HTMLParser;
class SimpleParser: public tap::HTMLTokenI
{
 void printLinkTags(std::string const& tagName, tap::Attributes const& attr)
 {
 tap::Attributes::const_iterator find;
 if (tagName == "a" && (find = attr.find("href")) != attr.end())
 {
 std::cout << "A: Link: " << find->second << "\n";
 }
 }
 public:
 void tagOpen(std::string const& tagName, tap::Attributes const& attr) override
 {
 printLinkTags(tagName, attr);
 }
 void tagOpenClose(std::string const& tagName, tap::Attributes const& attr) override
 {
 printLinkTags(tagName, attr);
 }
 void error(std::string const& message) override
 {
 std::cout << "Error: " << message << "\n";
 }
 void comment(std::string const& comment) override
 {
 std::cout << "Comment: " << comment << "\n";
 }
};
int main()
{
 std::ifstream amazon("t1");
 SimpleParser parser;
 tap::HTMLSaxParser sax(amazon, parser);
 sax.parse();
}
Phrancis
20.5k6 gold badges69 silver badges155 bronze badges
asked Aug 7, 2016 at 22:17
\$\endgroup\$
3
  • \$\begingroup\$ What standard are you using, 03, 11, 14? \$\endgroup\$ Commented Aug 7, 2016 at 23:49
  • \$\begingroup\$ @pacmaninbw: The current one: C++14 \$\endgroup\$ Commented Aug 7, 2016 at 23:52
  • \$\begingroup\$ The link will be useful to everyone who doesn't have any idea what is SAX parser. \$\endgroup\$ Commented Aug 8, 2016 at 17:44

1 Answer 1

3
\$\begingroup\$

Minor nitpicks, I suppose.

  1. You need to add

    #include <algorithm>
    

    to use std::find and std::find_if.

  2. Make loop an unsigned type.

    You have

    int loop;
    for(loop = 0; loop < tag.size(); ++loop)
    

    That produces a compiler warning from g++:

    warning: comparison between signed and unsigned integer expressions [-Wsign-compare]

    I suggest changing that to:

    decltype(tag.size()) loop;
    for(loop = 0; loop < tag.size(); ++loop)
    
  3. Example usage and real usage don't match.

    Your example usage says:

    curl -A '.....' www.amazon.com | parser
    

    However, in main you are using a hard coded filename, named "t1".

    std::ifstream amazon("t1");
    

    The usage should be:

    curl -A '.....' www.amazon.com > t1 && ./parser
    
answered Aug 20, 2016 at 6:53
\$\endgroup\$

Your Answer

Draft saved
Draft discarded

Sign up or log in

Sign up using Google
Sign up using Email and Password

Post as a guest

Required, but never shown

Post as a guest

Required, but never shown

By clicking "Post Your Answer", you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.