File parse.pl of Package perl-HTML-TagParser

#! /usr/bin/perl -w -t
#
# a small test script, that
# takes your favourite web pages and turns it into 
# a parsed perl data structure.

use strict;
use HTML::TagParser;
use Data::Dumper;
$Data::Dumper::Sortkeys = 1;

my $text = eval { local $/; open my $in, "<", shift; <$in>};

my $dom = HTML::TagParser->new($text);
my $r = {};
for my $e ($dom->getElementsByTagName('form'))
  {
    my $s = $e->subTree;
    my $sr = {};
    for my $se ($s->getElementsByTagName('input'))
      {
        store_element($sr, $se, 'input', 0);
      }
    for my $se ($s->getElementsByTagName('textarea'))
      {
        store_element($sr, $se, 'textarea', 1);
      }

    store_element($r, $e, 'form', $sr);
  }

for my $e ($dom->getElementsByTagName('a'))
  {
    my $attr = $e->attributes;
    next unless defined $attr->{href};
    store_element($r, $e, 'href', 0);
  }

die Dumper $r;
exit 0;

##############################################

sub store_element
{
  my ($r, $e, $tn, $container) = @_;
  my $attr = $e->attributes;
  my $inp =
    {
      tagname => $e->tagName,
      text => $e->innerText()||'',
    };
  delete $inp->{text} unless length $inp->{text};
  delete $attr->{'/'};
  if ($container)
    {
      $inp->{attr} = $attr;
      if (ref $container)
        {
	  for my $c (keys %$container) { $inp->{$c} = $container->{$c} }
	}
    }
  else
    {
      ## inline attr.
      for my $a (keys %$attr) { $inp->{$a} = $attr->{$a} }
    }

  push @{$r->{$tn}}, $inp;
  for my $a qw(id name type)
    {
      push @{$r->{$tn.'_by_'.$a}{$attr->{$a}}}, $inp 
	if defined $attr->{$a};
    }
}

openSUSE Build Service is sponsored by