HTML Tag Parsing

24,498

You could use IHTMLDocument2 DOM to parse whatever elements you need from the HTML:

uses ActiveX, MSHTML;

const
  HTML =
  '<div class="tvRow tvFirst hasLabel tvFirst" title="example1">' +
  '<label class="tvLabel">Name:</label>' +
  '<span class="tvValue">Value</span>' +
  '<div class="clear"></div>' +
  '</div>';

procedure TForm1.Button1Click(Sender: TObject);
var
  doc: OleVariant;
  el: OleVariant;
  i: Integer;
begin
  doc := coHTMLDocument.Create as IHTMLDocument2;
  doc.write(HTML);
  doc.close;
  ShowMessage(doc.body.innerHTML);
  for i := 0 to doc.body.all.length - 1 do
  begin
    el := doc.body.all.item(i);
    if (el.tagName = 'LABEL') and (el.className = 'tvLabel') then
      ShowMessage(el.innerText);
    if (el.tagName = 'SPAN') and (el.className = 'tvValue') then
      ShowMessage(el.innerText);
  end;
end;

I wanted to mention another very nice HTML parser I found today: htmlp (Delphi Dom HTML Parser and Converter). It's not as flexible as the IHTMLDocument2 obviously, but it's very easy to work with, fast, free, and supports Unicode for older Delphi versions.

Sample usage:

uses HtmlParser, DomCore;

function GetDocBody(HtmlDoc: TDocument): TElement;
var
  i: integer;
  node: TNode;
begin
  Result := nil;
  for i := 0 to HtmlDoc.documentElement.childNodes.length - 1 do
  begin
    node := HtmlDoc.documentElement.childNodes.item(i);
    if node.nodeName = 'body' then
    begin
      Result := node as TElement;
      Break;
    end;
  end;
end;

procedure THTMLForm.Button2Click(Sender: TObject);
var
  HtmlParser: THtmlParser;
  HtmlDoc: TDocument;
  i: Integer;
  body, el: TElement;
  node: TNode;
begin
  HtmlParser := THtmlParser.Create;
  try
    HtmlDoc := HtmlParser.parseString(HTML);
    try
      body := GetDocBody(HtmlDoc);
      if Assigned(body) then
        for i := 0 to body.childNodes.length - 1 do
        begin
          node := body.childNodes.item(i);
          if (node is TElement) then
          begin
            el := node as TElement;
            if (el.tagName = 'div') and (el.GetAttribute('class') = 'tvRow tvFirst hasLabel tvFirst') then
            begin
              // iterate el.childNodes here...
              ShowMessage(IntToStr(el.childNodes.length));
            end;
          end;
        end;
    finally
      HtmlDoc.Free;
    end;
  finally
    HtmlParser.Free
  end;
end;
Share:
24,498
Admin
Author by

Admin

Updated on December 25, 2020

Comments

  • Admin
    Admin over 3 years

    How can I parse Name: & Value text from within the tag with DIHtmlParser? I tried doing it with TCLHtmlParser from Clever Components but it failed. Second question is can DIHtmlParser parse individual tags for example loop through its sub tags. Its a total nightmare for such a simple problem.

    <div class="tvRow tvFirst hasLabel tvFirst" title="example1">
      <label class="tvLabel">Name:</label>
      <span class="tvValue">Value</span>
    <div class="clear"></div></div>
    
    <div class="tvRow tvFirst hasLabel tvFirst" title="example2">
      <label class="tvLabel">Name:</label>
      <span class="tvValue">Value</span>
    <div class="clear"></div></div>
    
  • Admin
    Admin over 11 years
    DIHtmlParser? Its a very complex parser. Nothing like TCLHtmlParser from Clever Components. I tried to use it and its a nightmare. Capability sure but usability? Horrible..
  • whosrdaddy
    whosrdaddy over 11 years
    @T0xic: sure it will work in a thread, don't forget to call CoInitialize first...