File parse.pl of Package perl-HTML-TagParser
#! /usr/bin/perl -w -t
#
# a small test script, that
# takes your favourite web pages and turns it into
# a parsed perl data structure.
use strict;
use HTML::TagParser;
use Data::Dumper;
$Data::Dumper::Sortkeys = 1;
my $text = eval { local $/; open my $in, "<", shift; <$in>};
my $dom = HTML::TagParser->new($text);
my $r = {};
for my $e ($dom->getElementsByTagName('form'))
{
my $s = $e->subTree;
my $sr = {};
for my $se ($s->getElementsByTagName('input'))
{
store_element($sr, $se, 'input', 0);
}
for my $se ($s->getElementsByTagName('textarea'))
{
store_element($sr, $se, 'textarea', 1);
}
store_element($r, $e, 'form', $sr);
}
for my $e ($dom->getElementsByTagName('a'))
{
my $attr = $e->attributes;
next unless defined $attr->{href};
store_element($r, $e, 'href', 0);
}
die Dumper $r;
exit 0;
##############################################
sub store_element
{
my ($r, $e, $tn, $container) = @_;
my $attr = $e->attributes;
my $inp =
{
tagname => $e->tagName,
text => $e->innerText()||'',
};
delete $inp->{text} unless length $inp->{text};
delete $attr->{'/'};
if ($container)
{
$inp->{attr} = $attr;
if (ref $container)
{
for my $c (keys %$container) { $inp->{$c} = $container->{$c} }
}
}
else
{
## inline attr.
for my $a (keys %$attr) { $inp->{$a} = $attr->{$a} }
}
push @{$r->{$tn}}, $inp;
for my $a qw(id name type)
{
push @{$r->{$tn.'_by_'.$a}{$attr->{$a}}}, $inp
if defined $attr->{$a};
}
}