Initial Commit

This commit is contained in:
Riley Schneider
2025-12-03 16:38:10 +01:00
parent c5e26bf594
commit b732d8d4b5
17680 changed files with 5977495 additions and 2 deletions

View File

@@ -0,0 +1,690 @@
package HTML::Clean;
use Carp;
use IO;
use Fcntl;
use strict;
require 5.004;
use vars qw($VERSION @ISA @EXPORT @EXPORT_OK);
require Exporter;
require AutoLoader;
# Items to export to callers namespace
@EXPORT = qw();
$VERSION = '0.8';
=head1 NAME
HTML::Clean - Cleans up HTML code for web browsers, not humans
=head1 SYNOPSIS
use HTML::Clean;
$h = new HTML::Clean($filename); # or..
$h = new HTML::Clean($htmlcode);
$h->compat();
$h->strip();
$data = $h->data();
print $$data;
=head1 DESCRIPTION
The HTML::Clean module encapsulates a number of common techniques for
minimizing the size of HTML files. You can typically save between
10% and 50% of the size of a HTML file using these methods.
It provides the following features:
=over 8
=item Remove unneeded whitespace (begining of line, etc)
=item Remove unneeded META elements.
=item Remove HTML comments (except for styles, javascript and SSI)
=item Replace tags with equivilant shorter tags (<strong> --> <b>)
=item etc.
=back
The entire proces is configurable, so you can pick and choose what you want
to clean.
=head1 THE HTML::Clean CLASS
=over 4
=cut
######################################################################
=head2 $h = new HTML::Clean($dataorfile, [$level]);
This creates a new HTML::Clean object. A Prerequisite for all other
functions in this module.
The $dataorfile parameter supplies the input HTML, either a filename,
or a reference to a scalar value holding the HTML, for example:
$h = new HTML::Clean("/htdocs/index.html");
$html = "<strong>Hello!</strong>";
$h = new HTML::Clean(\$html);
An optional 'level' parameter controls the level of optimization
performed. Levels range from 1 to 9. Level 1 includes only simple
fast optimizations. Level 9 includes all optimizations.
=cut
sub new {
my $this = shift;
my $class = ref($this) || $this;
my $self = {};
bless $self, $class;
my $data = shift;
my $level = shift;
if ($self->initialize($data)) {
# set the default level
$level = 9 if (!$level);
$self->level($level);
return $self;
} else {
undef $self;
return undef;
}
}
#
# Set up the data in the self hash..
#
=head2 $h->initialize($dataorfile)
This function allows you to reinitialize the HTML data used by the
current object. This is useful if you are processing many files.
$dataorfile has the same usage as the new method.
Return 0 for an error, 1 for success.
=cut
sub initialize {
my($self, $data) = @_;
$self->{'DATA'} = undef;
# Not defined? Just return true.
return(1) if (!$data);
# Check if it's a ref
if (ref($data)) {
$self->{DATA} = $data;
return(1);
}
# Newline char, really an error, but just go with it..
if ($data =~ /\n/) {
$self->{'DATA'} = \$data;
}
# No newline? Must be a filename
if (-f $data) {
my $storage;
sysopen(IN, "$data", O_RDONLY) || return(0);
while (<IN>) {
$storage .= $_;
}
close(IN);
$self->{'DATA'} = \$storage;
return(1);
}
return(0); # file not found?
}
=head2 $h->level([$level])
Get/set the optimization level. $level is a number from 1 to 9.
=cut
sub level {
my($self, $level) = @_;
if (defined($level) && ($level > 0) && ($level < 10)) {
$self->{'LEVEL'} = $level
}
return($self->{'LEVEL'});
}
=head2 $myref = $h->data()
Returns the current HTML data as a scalar reference.
=cut
sub data {
my($self) = @_;
return $self->{'DATA'};
}
# Junk HTML comments (INTERNAL)
sub _commentcheck($) {
my($comment) = @_;
$_ = $comment;
# Server side include
return($comment) if (m,^<!--\#,si);
# ITU Hack.. preserve some frontpage components
return($comment) if (m,^<!-- %,si);
return($comment) if (m,bot="(SaveResults|Search|ConfirmationField)",si);
# Javascript
return($comment) if (m,//.*-->$,si);
return($comment) if (m,navigator\.app(name|version),si);
# Stylesheet
return($comment) if (m,[A-z0-9]+\:[A-z0-9]+\s*\{.*\},si);
return('');
}
# Remove javascript comments (INTERNAL)
sub _jscomments {
my($js) = @_;
$js =~ s,\n\s*//.*?\n,\n,sig;
$js =~ s,\s+//.*?\n,\n,sig;
# insure javascript is hidden
if ($js =~ m,<!--,) {
$js =~ s,</script>,// -->\n</script>,si;
}
return($js);
}
# Clean up other javascript stuff..
sub _javascript {
my($js) = @_;
# remove excess whitespace at the beginning and end of lines
$js =~ s,\s*\n+\s*,\n,sig;
# braces/semicolon at end of line, join next line
$js =~ s,([;{}])\n,$1,sig;
# What else is safe to do?
return($js);
}
# replace #000000 -> black, etc..
# Does the browser render faster with RGB? You would think so..
sub _defcolorcheck ($) {
my($c) = @_;
$c =~ s/\#000000/black/;
$c =~ s/\#c0c0c0/silver/i;
$c =~ s/\#808080/gray/;
$c =~ s/\#ffffff/white/i;
$c =~ s/\#800000/maroon/;
$c =~ s/\#ff0000/red/i;
$c =~ s/\#800080/purple/;
$c =~ s/\#ff00ff/fuchsia/i;
$c =~ s/\#ff00ff/fuchsia/i;
$c =~ s/\#008000/green/;
$c =~ s/\#00ff00/lime/i;
$c =~ s/\#808000/olive/;
$c =~ s/\#ffff00/yellow/i;
$c =~ s/\#000080/navy/;
$c =~ s/\#0000ff/blue/i;
$c =~ s/\#008080/teal/i;
$c =~ s/\#00ffff/aqua/i;
return($c);
}
# For replacing entities with numerics
use vars qw/ %_ENTITIES/;
%_ENTITIES = (
'Agrave' => 192,
'Aacute' => 193,
'Acirc' => 194,
'Atilde' => 195,
'Auml' => 196,
'Aring' => 197,
'AElig' => 198,
'Ccedil' => 199,
'Egrave' => 200,
'Eacute' => 201,
'Ecirc' => 202,
'Euml' => 203,
'Igrave' => 204,
'Iacute' => 205,
'Icirc' => 206,
'Iuml' => 207,
'ETH' => 208,
'Ntilde' => 209,
'Ograve' => 210,
'Oacute' => 211,
'Ocirc' => 212,
'Otilde' => 213,
'Ouml' => 214,
'Oslash' => 216,
'Ugrave' => 217,
'Uacute' => 218,
'Ucirc' => 219,
'Uuml' => 220,
'Yacute' => 221,
'THORN' => 222,
'szlig' => 223,
'agrave' => 224,
'aacute' => 225,
'acirc' => 226,
'atilde' => 227,
'auml' => 228,
'aring' => 229,
'aelig' => 230,
'ccedil' => 231,
'egrave' => 232,
'eacute' => 233,
'ecirc' => 234,
'euml' => 235,
'igrave' => 236,
'iacute' => 237,
'icirc' => 238,
'iuml' => 239,
'eth' => 240,
'ntilde' => 241,
'ograve' => 242,
'oacute' => 243,
'ocirc' => 244,
'otilde' => 245,
'ouml' => 246,
'oslash' => 248,
'ugrave' => 249,
'uacute' => 250,
'ucirc' => 251,
'uuml' => 252,
'yacute' => 253,
'thorn' => 254,
'yuml' => 255
);
=head2 strip(\%options);
Removes excess space from HTML
You can control the optimizations used by specifying them in the
%options hash reference.
The following options are recognized:
=over 8
=item boolean values (0 or 1 values)
whitespace Remove excess whitespace
shortertags <strong> -> <b>, etc..
blink No blink tags.
contenttype Remove default contenttype.
comments Remove excess comments.
entities &quot; -> ", etc.
dequote remove quotes from tag parameters where possible.
defcolor recode colors in shorter form. (#ffffff -> white, etc.)
javascript remove excess spaces and newlines in javascript code.
htmldefaults remove default values for some html tags
lowercasetags translate all HTML tags to lowercase
=item parameterized values
meta Takes a space separated list of meta tags to remove,
default "GENERATOR FORMATTER"
emptytags Takes a space separated list of tags to remove when there is no
content between the start and end tag, like this: <b></b>.
The default is 'b i font center'
=back
=cut
use vars qw/
$do_whitespace
$do_shortertags
$do_meta
$do_blink
$do_contenttype
$do_comments
$do_entities
$do_dequote
$do_defcolor
$do_emptytags
$do_javascript
$do_htmldefaults
$do_lowercasetags
$do_defbaseurl
/;
$do_whitespace = 1;
$do_shortertags = 1;
$do_meta = "generator formatter";
$do_blink = 1;
$do_contenttype = 1;
$do_comments = 1;
$do_entities = 1;
$do_dequote = 1;
$do_defcolor = 1;
$do_emptytags = 'b i font center';
$do_javascript = 1;
$do_htmldefaults = 1;
$do_lowercasetags = 1;
$do_defbaseurl = '';
sub strip {
my($self, $options) = @_;
my $h = $self->{'DATA'};
my $level = $self->{'LEVEL'};
# Select a set of options based on $level, and then modify based on
# user supplied options.
_level_defaults($level);
if(defined($options)) {
no strict 'refs';
for (keys(%$options)) {
${"do_" . lc($_)} = $options->{$_} if defined ${"do_" . lc($_)};
}
}
if ($do_shortertags) {
$$h =~ s,<strong>,<b>,sgi;
$$h =~ s,</strong>,</b>,sgi;
$$h =~ s,<em>,<i>,sgi;
$$h =~ s,</em>,</i>,sgi;
}
if ($do_whitespace) {
$$h =~ s,[\r\n]+,\n,sg; # Carriage/LF -> LF
$$h =~ s,\s+\n,\n,sg; # empty line
$$h =~ s,\n\s+<,\n<,sg; # space before tag
$$h =~ s,\n\s+,\n ,sg; # other spaces
$$h =~ s,>\n\s*<,><,sg; # LF/spaces between tags..
# Remove excess spaces within tags.. note, we could parse out the elements
# and rewrite for excess spaces between elements. perhaps next version.
# removed due to problems with > and < in tag elements..
#$$h =~ s,\s+>,>,sg;
#$$h =~ s,<\s+,<,sg;
# do this again later..
}
if ($do_entities) {
$$h =~ s,&quot;,\",sg;
# Simplify long entity names if using default charset...
$$h =~ m,charset=([^\"]+)\",;
if (!defined($1) || ($1 eq 'iso-8859-1')) {
$$h =~ s,&([A-z]+);,($_ENTITIES{$1}) ? chr($_ENTITIES{$1}) : $&,sige;
}
}
if ($do_meta) {
foreach my $m (split(/\s+/, $do_meta)) {
$$h =~ s,<meta name="$m"[^>]*?>,,sig;
}
}
if ($do_contenttype) {
# Don't need this, since it is the default for most web servers
# Also gets rid of 'blinking pages' in older versions of netscape.
$$h =~ s,<meta http-equiv="Content-Type".*?content="text/html;.*?charset=iso-8859-1">,,sig;
}
if ($do_defcolor) {
$$h =~ s,(<[^<]+?color=['"]?\#[0-9A-Fa-f]+["']?),_defcolorcheck($&),sige;
}
if ($do_comments) {
# don't strip server side includes..
# try not to get javascript, or styles...
$$h =~ s,<!--.*?-->,_commentcheck($&),sige;
# Remove javascript comments
$$h =~ s,<script[^>]*(java|ecma)script[^>]*>.*?</script>,_jscomments($&),sige;
}
if ($do_javascript) {
#
$$h =~ s,<script[^>]*(java|ecma)script[^>]*>.*?</script>,_javascript($&),sige;
}
if ($do_blink) {
$$h =~ s,<BLINK>,,sgi;
$$h =~ s,</BLINK>,,sgi;
}
if ($do_dequote) {
while ($$h =~ s,<([A-z]+ [A-z]+=)(['"])([A-z0-9]+)\2(\s*?[^>]*?>),<$1$3$4,sig)
{
# Remove alphanumeric quotes. Note, breaks DTD..
;
}
}
# remove <b></b>, etc..
if ($do_emptytags) {
my $pat = $do_emptytags;
$pat =~ s/\s+/|/g;
while ($$h =~ s,<($pat)(\s+[^>]*?)?>\s*</\1>,,siog){}
}
if ($do_htmldefaults) {
# Tables
# seems to break things..
#$$h =~ s,(<table[^>]*)\s+border=0([^>]*>),$1$2,sig;
$$h =~ s,(<td[^>]*)\s+rowspan=1([^>]*>),$1$2,sig;
$$h =~ s,(<td[^>]*)\s+colspan=1([^>]*>),$1$2,sig;
#
# P, TABLE tags are default left aligned..
# lynx is inconsistent in this manner though..
$$h =~ s,<(P|table|td)( [^>]*)align=\"?left\"?([^>]*)>,<$1$2$3>,sig;
# OL start=1
$$h =~ s,(<OL [^>]*)start=\"?1\"?([^>]*>),$1$2,sig;
# FORM
$$h =~ s,(<form [^>]*)method=\"?get\"?([^>]*>),$1$2,sig;
$$h =~ s,(<form [^>]*)enctype=\"application/x-www-form-urlencoded\"([^>]*>),$1$2,sig;
# hr
$$h =~ s,(<hr [^>]*)align=\"?center\"?([^>]*>),$1$2,sig;
$$h =~ s,(<hr [^>]*)width=\"?100%\"?([^>]*>),$1$2,sig;
# URLs
$$h =~ s,(href|src)(=\"?http://[^/:]+):80/,$1$2/,sig;
}
if ($do_whitespace) {
# remove space within tags <center > becomes <center>
$$h =~ s,\s+>,>,sg;
$$h =~ s,<\s+,<,sg;
# join lines with a space at the beginning/end of the line
# and a line that begins with a tag
$$h =~ s,>\n ,> ,sig;
$$h =~ s, \n<, <,sig;
}
if ($do_lowercasetags) {
# translate tags to lowercase to (hopefully) improve compressability..
# simple tags <H1>, </H1> etc.
$$h =~ s,(<[/]?[a-zA-Z][a-zA-Z0-9_-]*\s*>),\L$1\E,sg;
# the rest..
$$h =~ s/(<[a-zA-Z][a-zA-Z0-9_-]*)(\s+.*?>)/_lowercasetag($1,$2)/sge;
}
}
sub _lowercasetag {
my($prefix, $body) = @_;
$prefix =~ s/^(.+)$/\L$1\E/;
$body =~ s/(\s+[a-zA-Z][a-zA-Z0-9_-]*)(\s*=\s*[^"\s]+|\s*=\s*"[^"]*"|>|\s)/\L$1\E$2/sg;
return $prefix.$body;
}
# set options based on the level provided.. INTERNAL
sub _level_defaults($) {
my ($level) = @_;
$do_whitespace = 1; # always do this...
# level 2
$do_shortertags = ($level > 1) ? 1 : 0;
$do_meta = ($level > 1) ? "generator formatter" : "";
$do_contenttype = ($level > 1) ? 1 : 0;
# level 3
$do_entities = ($level > 2) ? 1 : 0;
$do_blink = ($level > 2) ? 1 : 0;
# level 4
$do_comments = ($level > 3) ? 1 : 0;
$do_dequote = ($level > 3) ? 1 : 0;
$do_defcolor = ($level > 3) ? 1 : 0;
$do_emptytags = ($level > 3) ? 'b i font center' : 0;
$do_javascript = ($level > 3) ? 1 : 0;
$do_htmldefaults = ($level > 3) ? 1 : 0;
$do_lowercasetags = ($level > 3) ? 1 : 0;
# higher levels reserved for more intensive optimizations.
}
######################################################################
=head2 compat()
This function improves the cross-platform compatibility of your HTML.
Currently checks for the following problems:
=over 8
=item Insuring all IMG tags have ALT elements.
=item Use of Arial, Futura, or Verdana as a font face.
=item Positioning the <TITLE> tag immediately after the <head> tag.
=back
=cut
sub compat {
my($self, $level, $options) = @_;
my $h = $self->{'DATA'};
$$h =~ s/face="arial"/face="arial,helvetica,sansserif"/sgi;
$$h =~ s/face="(verdana|futura)"/face="$1,arial,helvetica,sansserif"/sgi;
# insure that <title> tag is directly after the <head> tag
# Some search engines only search the first N chars. (PLweb for instance..)
if ($$h =~ s,<title>(.*)</title>,,si) {
my $title = $1;
$$h =~ s,<head>,<head><title>$title</title>,si;
}
# Look for IMG without ALT tags.
$$h =~ s/(<img[^>]+>)/_imgalt($1)/segi;
}
sub _imgalt {
my($tag) = @_;
$tag =~ s/>/ alt="">/ if ($tag !~ /alt=/i);
return($tag);
}
=head2 defrontpage();
This function converts pages created with Microsoft Frontpage to
something a Unix server will understand a bit better. This function
currently does the following:
=over 8
=item Converts Frontpage 'hit counters' into a unix specific format.
=item Removes some frontpage specific html comments
=back
=cut
sub defrontpage {
my($self) = @_;
my $h = $self->{'DATA'};
while ($$h =~ s,<img\sSRC="[\./]*_vti_bin/fpcount.exe(/.*/).Page=(.*?)\|.*?\s(.*?)>,<img src="/counter?link=$1$2" $3>,xis) {
print "Converted a Hitcounter.. $1, $2, $3\n";
}
$$h =~ s,<!--(mstheme|msthemeseparator|msnavigation)-->,,sgx;
}
=back
=head1 SEE ALSO
=head2 Modules
FrontPage::Web, FrontPage::File
=head2 Web Sites
=over 6
=item Distribution Site - http://people.itu.int/~lindner/
=back
=head1 AUTHORS
Paul Lindner for the International Telecommunication Union (ITU)
=head1 COPYRIGHT
The HTML::Strip module is Copyright (c) 1998,99 by the ITU, Geneva Switzerland.
All rights reserved.
You may distribute under the terms of either the GNU General Public
License or the Artistic License, as specified in the Perl README file.
=cut
1;
__END__

View File

@@ -0,0 +1,500 @@
package HTML::Entities;
=head1 NAME
HTML::Entities - Encode or decode strings with HTML entities
=head1 SYNOPSIS
use HTML::Entities;
$a = "V&aring;re norske tegn b&oslash;r &#230res";
decode_entities($a);
encode_entities($a, "\200-\377");
For example, this:
$input = "vis-à-vis Beyoncé's naïve\npapier-mâché résumé";
print encode_entities($input), "\n"
Prints this out:
vis-&agrave;-vis Beyonc&eacute;'s na&iuml;ve
papier-m&acirc;ch&eacute; r&eacute;sum&eacute;
=head1 DESCRIPTION
This module deals with encoding and decoding of strings with HTML
character entities. The module provides the following functions:
=over 4
=item decode_entities( $string, ... )
This routine replaces HTML entities found in the $string with the
corresponding Unicode character. Under perl 5.6 and earlier only
characters in the Latin-1 range are replaced. Unrecognized
entities are left alone.
If multiple strings are provided as argument they are each decoded
separately and the same number of strings are returned.
If called in void context the arguments are decoded in-place.
This routine is exported by default.
=item _decode_entities( $string, \%entity2char )
=item _decode_entities( $string, \%entity2char, $expand_prefix )
This will in-place replace HTML entities in $string. The %entity2char
hash must be provided. Named entities not found in the %entity2char
hash are left alone. Numeric entities are expanded unless their value
overflow.
The keys in %entity2char are the entity names to be expanded and their
values are what they should expand into. The values do not have to be
single character strings. If a key has ";" as suffix,
then occurrences in $string are only expanded if properly terminated
with ";". Entities without ";" will be expanded regardless of how
they are terminated for compatibility with how common browsers treat
entities in the Latin-1 range.
If $expand_prefix is TRUE then entities without trailing ";" in
%entity2char will even be expanded as a prefix of a longer
unrecognized name. The longest matching name in %entity2char will be
used. This is mainly present for compatibility with an MSIE
misfeature.
$string = "foo&nbspbar";
_decode_entities($string, { nb => "@", nbsp => "\xA0" }, 1);
print $string; # will print "foo bar"
This routine is exported by default.
=item encode_entities( $string )
=item encode_entities( $string, $unsafe_chars )
This routine replaces unsafe characters in $string with their entity
representation. A second argument can be given to specify which characters to
consider unsafe. The unsafe characters is specified using the regular
expression character class syntax (what you find within brackets in regular
expressions).
The default set of characters to encode are control chars, high-bit chars, and
the C<< < >>, C<< & >>, C<< > >>, C<< ' >> and C<< " >> characters. But this,
for example, would encode I<just> the C<< < >>, C<< & >>, C<< > >>, and C<< "
>> characters:
$encoded = encode_entities($input, '<>&"');
and this would only encode non-plain ascii:
$encoded = encode_entities($input, '^\n\x20-\x25\x27-\x7e');
This routine is exported by default.
=item encode_entities_numeric( $string )
=item encode_entities_numeric( $string, $unsafe_chars )
This routine works just like encode_entities, except that the replacement
entities are always C<&#xI<hexnum>;> and never C<&I<entname>;>. For
example, C<encode_entities("r\xF4le")> returns "r&ocirc;le", but
C<encode_entities_numeric("r\xF4le")> returns "r&#xF4;le".
This routine is I<not> exported by default. But you can always
export it with C<use HTML::Entities qw(encode_entities_numeric);>
or even C<use HTML::Entities qw(:DEFAULT encode_entities_numeric);>
=back
All these routines modify the string passed as the first argument, if
called in a void context. In scalar and array contexts, the encoded or
decoded string is returned (without changing the input string).
If you prefer not to import these routines into your namespace, you can
call them as:
use HTML::Entities ();
$decoded = HTML::Entities::decode($a);
$encoded = HTML::Entities::encode($a);
$encoded = HTML::Entities::encode_numeric($a);
The module can also export the %char2entity and the %entity2char
hashes, which contain the mapping from all characters to the
corresponding entities (and vice versa, respectively).
=head1 COPYRIGHT
Copyright 1995-2006 Gisle Aas. All rights reserved.
This library is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.
=cut
use strict;
use vars qw(@ISA @EXPORT @EXPORT_OK $VERSION);
use vars qw(%entity2char %char2entity);
require 5.004;
require Exporter;
@ISA = qw(Exporter);
@EXPORT = qw(encode_entities decode_entities _decode_entities);
@EXPORT_OK = qw(%entity2char %char2entity encode_entities_numeric);
$VERSION = "3.64";
sub Version { $VERSION; }
require HTML::Parser; # for fast XS implemented decode_entities
%entity2char = (
# Some normal chars that have special meaning in SGML context
amp => '&', # ampersand
'gt' => '>', # greater than
'lt' => '<', # less than
quot => '"', # double quote
apos => "'", # single quote
# PUBLIC ISO 8879-1986//ENTITIES Added Latin 1//EN//HTML
AElig => chr(198), # capital AE diphthong (ligature)
Aacute => chr(193), # capital A, acute accent
Acirc => chr(194), # capital A, circumflex accent
Agrave => chr(192), # capital A, grave accent
Aring => chr(197), # capital A, ring
Atilde => chr(195), # capital A, tilde
Auml => chr(196), # capital A, dieresis or umlaut mark
Ccedil => chr(199), # capital C, cedilla
ETH => chr(208), # capital Eth, Icelandic
Eacute => chr(201), # capital E, acute accent
Ecirc => chr(202), # capital E, circumflex accent
Egrave => chr(200), # capital E, grave accent
Euml => chr(203), # capital E, dieresis or umlaut mark
Iacute => chr(205), # capital I, acute accent
Icirc => chr(206), # capital I, circumflex accent
Igrave => chr(204), # capital I, grave accent
Iuml => chr(207), # capital I, dieresis or umlaut mark
Ntilde => chr(209), # capital N, tilde
Oacute => chr(211), # capital O, acute accent
Ocirc => chr(212), # capital O, circumflex accent
Ograve => chr(210), # capital O, grave accent
Oslash => chr(216), # capital O, slash
Otilde => chr(213), # capital O, tilde
Ouml => chr(214), # capital O, dieresis or umlaut mark
THORN => chr(222), # capital THORN, Icelandic
Uacute => chr(218), # capital U, acute accent
Ucirc => chr(219), # capital U, circumflex accent
Ugrave => chr(217), # capital U, grave accent
Uuml => chr(220), # capital U, dieresis or umlaut mark
Yacute => chr(221), # capital Y, acute accent
aacute => chr(225), # small a, acute accent
acirc => chr(226), # small a, circumflex accent
aelig => chr(230), # small ae diphthong (ligature)
agrave => chr(224), # small a, grave accent
aring => chr(229), # small a, ring
atilde => chr(227), # small a, tilde
auml => chr(228), # small a, dieresis or umlaut mark
ccedil => chr(231), # small c, cedilla
eacute => chr(233), # small e, acute accent
ecirc => chr(234), # small e, circumflex accent
egrave => chr(232), # small e, grave accent
eth => chr(240), # small eth, Icelandic
euml => chr(235), # small e, dieresis or umlaut mark
iacute => chr(237), # small i, acute accent
icirc => chr(238), # small i, circumflex accent
igrave => chr(236), # small i, grave accent
iuml => chr(239), # small i, dieresis or umlaut mark
ntilde => chr(241), # small n, tilde
oacute => chr(243), # small o, acute accent
ocirc => chr(244), # small o, circumflex accent
ograve => chr(242), # small o, grave accent
oslash => chr(248), # small o, slash
otilde => chr(245), # small o, tilde
ouml => chr(246), # small o, dieresis or umlaut mark
szlig => chr(223), # small sharp s, German (sz ligature)
thorn => chr(254), # small thorn, Icelandic
uacute => chr(250), # small u, acute accent
ucirc => chr(251), # small u, circumflex accent
ugrave => chr(249), # small u, grave accent
uuml => chr(252), # small u, dieresis or umlaut mark
yacute => chr(253), # small y, acute accent
yuml => chr(255), # small y, dieresis or umlaut mark
# Some extra Latin 1 chars that are listed in the HTML3.2 draft (21-May-96)
copy => chr(169), # copyright sign
reg => chr(174), # registered sign
nbsp => chr(160), # non breaking space
# Additional ISO-8859/1 entities listed in rfc1866 (section 14)
iexcl => chr(161),
cent => chr(162),
pound => chr(163),
curren => chr(164),
yen => chr(165),
brvbar => chr(166),
sect => chr(167),
uml => chr(168),
ordf => chr(170),
laquo => chr(171),
'not' => chr(172), # not is a keyword in perl
shy => chr(173),
macr => chr(175),
deg => chr(176),
plusmn => chr(177),
sup1 => chr(185),
sup2 => chr(178),
sup3 => chr(179),
acute => chr(180),
micro => chr(181),
para => chr(182),
middot => chr(183),
cedil => chr(184),
ordm => chr(186),
raquo => chr(187),
frac14 => chr(188),
frac12 => chr(189),
frac34 => chr(190),
iquest => chr(191),
'times' => chr(215), # times is a keyword in perl
divide => chr(247),
( $] > 5.007 ? (
'OElig;' => chr(338),
'oelig;' => chr(339),
'Scaron;' => chr(352),
'scaron;' => chr(353),
'Yuml;' => chr(376),
'fnof;' => chr(402),
'circ;' => chr(710),
'tilde;' => chr(732),
'Alpha;' => chr(913),
'Beta;' => chr(914),
'Gamma;' => chr(915),
'Delta;' => chr(916),
'Epsilon;' => chr(917),
'Zeta;' => chr(918),
'Eta;' => chr(919),
'Theta;' => chr(920),
'Iota;' => chr(921),
'Kappa;' => chr(922),
'Lambda;' => chr(923),
'Mu;' => chr(924),
'Nu;' => chr(925),
'Xi;' => chr(926),
'Omicron;' => chr(927),
'Pi;' => chr(928),
'Rho;' => chr(929),
'Sigma;' => chr(931),
'Tau;' => chr(932),
'Upsilon;' => chr(933),
'Phi;' => chr(934),
'Chi;' => chr(935),
'Psi;' => chr(936),
'Omega;' => chr(937),
'alpha;' => chr(945),
'beta;' => chr(946),
'gamma;' => chr(947),
'delta;' => chr(948),
'epsilon;' => chr(949),
'zeta;' => chr(950),
'eta;' => chr(951),
'theta;' => chr(952),
'iota;' => chr(953),
'kappa;' => chr(954),
'lambda;' => chr(955),
'mu;' => chr(956),
'nu;' => chr(957),
'xi;' => chr(958),
'omicron;' => chr(959),
'pi;' => chr(960),
'rho;' => chr(961),
'sigmaf;' => chr(962),
'sigma;' => chr(963),
'tau;' => chr(964),
'upsilon;' => chr(965),
'phi;' => chr(966),
'chi;' => chr(967),
'psi;' => chr(968),
'omega;' => chr(969),
'thetasym;' => chr(977),
'upsih;' => chr(978),
'piv;' => chr(982),
'ensp;' => chr(8194),
'emsp;' => chr(8195),
'thinsp;' => chr(8201),
'zwnj;' => chr(8204),
'zwj;' => chr(8205),
'lrm;' => chr(8206),
'rlm;' => chr(8207),
'ndash;' => chr(8211),
'mdash;' => chr(8212),
'lsquo;' => chr(8216),
'rsquo;' => chr(8217),
'sbquo;' => chr(8218),
'ldquo;' => chr(8220),
'rdquo;' => chr(8221),
'bdquo;' => chr(8222),
'dagger;' => chr(8224),
'Dagger;' => chr(8225),
'bull;' => chr(8226),
'hellip;' => chr(8230),
'permil;' => chr(8240),
'prime;' => chr(8242),
'Prime;' => chr(8243),
'lsaquo;' => chr(8249),
'rsaquo;' => chr(8250),
'oline;' => chr(8254),
'frasl;' => chr(8260),
'euro;' => chr(8364),
'image;' => chr(8465),
'weierp;' => chr(8472),
'real;' => chr(8476),
'trade;' => chr(8482),
'alefsym;' => chr(8501),
'larr;' => chr(8592),
'uarr;' => chr(8593),
'rarr;' => chr(8594),
'darr;' => chr(8595),
'harr;' => chr(8596),
'crarr;' => chr(8629),
'lArr;' => chr(8656),
'uArr;' => chr(8657),
'rArr;' => chr(8658),
'dArr;' => chr(8659),
'hArr;' => chr(8660),
'forall;' => chr(8704),
'part;' => chr(8706),
'exist;' => chr(8707),
'empty;' => chr(8709),
'nabla;' => chr(8711),
'isin;' => chr(8712),
'notin;' => chr(8713),
'ni;' => chr(8715),
'prod;' => chr(8719),
'sum;' => chr(8721),
'minus;' => chr(8722),
'lowast;' => chr(8727),
'radic;' => chr(8730),
'prop;' => chr(8733),
'infin;' => chr(8734),
'ang;' => chr(8736),
'and;' => chr(8743),
'or;' => chr(8744),
'cap;' => chr(8745),
'cup;' => chr(8746),
'int;' => chr(8747),
'there4;' => chr(8756),
'sim;' => chr(8764),
'cong;' => chr(8773),
'asymp;' => chr(8776),
'ne;' => chr(8800),
'equiv;' => chr(8801),
'le;' => chr(8804),
'ge;' => chr(8805),
'sub;' => chr(8834),
'sup;' => chr(8835),
'nsub;' => chr(8836),
'sube;' => chr(8838),
'supe;' => chr(8839),
'oplus;' => chr(8853),
'otimes;' => chr(8855),
'perp;' => chr(8869),
'sdot;' => chr(8901),
'lceil;' => chr(8968),
'rceil;' => chr(8969),
'lfloor;' => chr(8970),
'rfloor;' => chr(8971),
'lang;' => chr(9001),
'rang;' => chr(9002),
'loz;' => chr(9674),
'spades;' => chr(9824),
'clubs;' => chr(9827),
'hearts;' => chr(9829),
'diams;' => chr(9830),
) : ())
);
# Make the opposite mapping
while (my($entity, $char) = each(%entity2char)) {
$entity =~ s/;\z//;
$char2entity{$char} = "&$entity;";
}
delete $char2entity{"'"}; # only one-way decoding
# Fill in missing entities
for (0 .. 255) {
next if exists $char2entity{chr($_)};
$char2entity{chr($_)} = "&#$_;";
}
my %subst; # compiled encoding regexps
sub decode_entities_old
{
my $array;
if (defined wantarray) {
$array = [@_]; # copy
} else {
$array = \@_; # modify in-place
}
my $c;
for (@$array) {
s/(&\#(\d+);?)/$2 < 256 ? chr($2) : $1/eg;
s/(&\#[xX]([0-9a-fA-F]+);?)/$c = hex($2); $c < 256 ? chr($c) : $1/eg;
s/(&(\w+);?)/$entity2char{$2} || $1/eg;
}
wantarray ? @$array : $array->[0];
}
sub encode_entities
{
return undef unless defined $_[0];
my $ref;
if (defined wantarray) {
my $x = $_[0];
$ref = \$x; # copy
} else {
$ref = \$_[0]; # modify in-place
}
if (defined $_[1] and length $_[1]) {
unless (exists $subst{$_[1]}) {
# Because we can't compile regex we fake it with a cached sub
my $chars = $_[1];
$chars =~ s,(?<!\\)([]/]),\\$1,g;
$chars =~ s,(?<!\\)\\\z,\\\\,;
my $code = "sub {\$_[0] =~ s/([$chars])/\$char2entity{\$1} || num_entity(\$1)/ge; }";
$subst{$_[1]} = eval $code;
die( $@ . " while trying to turn range: \"$_[1]\"\n "
. "into code: $code\n "
) if $@;
}
&{$subst{$_[1]}}($$ref);
} else {
# Encode control chars, high bit chars and '<', '&', '>', ''' and '"'
$$ref =~ s/([^\n\r\t !\#\$%\(-;=?-~])/$char2entity{$1} || num_entity($1)/ge;
}
$$ref;
}
sub encode_entities_numeric {
local %char2entity;
return &encode_entities; # a goto &encode_entities wouldn't work
}
sub num_entity {
sprintf "&#x%X;", ord($_[0]);
}
# Set up aliases
*encode = \&encode_entities;
*encode_numeric = \&encode_entities_numeric;
*encode_numerically = \&encode_entities_numeric;
*decode = \&decode_entities;
1;

View File

@@ -0,0 +1,684 @@
package HTML::FillInForm;
use integer; # no floating point math so far!
use strict; # and no funny business, either.
use Carp; # generate better errors with more context
# required for attr_encoded
use HTML::Parser 3.26;
# required for UNIVERSAL->can
require 5.005;
use vars qw($VERSION @ISA);
$VERSION = '2.00';
@ISA = qw(HTML::Parser);
sub new {
my ($class) = @_;
my $self = bless {}, $class;
$self->init;
# tell HTML::Parser not to decode attributes
$self->attr_encoded(1);
return $self;
}
# a few shortcuts to fill()
sub fill_file { my $self = shift; return $self->fill('file' ,@_); }
sub fill_arrayref { my $self = shift; return $self->fill('arrayref' ,@_); }
sub fill_scalarref { my $self = shift; return $self->fill('scalarref',@_); }
# track the keys we support. Useful for file-name detection.
sub _known_keys {
return {
scalarref => 1,
arrayref => 1,
fdat => 1,
fobject => 1,
file => 1,
target => 1,
fill_password => 1,
ignore_fields => 1,
disable_fields => 1,
}
}
sub fill {
my $self = shift;
# If we are called as a class method, go ahead and call new().
$self = $self->new if (not ref $self);
my %option;
# If the first arg is a scalarref, translate that to scalarref => $first_arg
if (ref $_[0] eq 'SCALAR') {
$option{scalarref} = shift;
}
elsif (ref $_[0] eq 'ARRAY') {
$option{arrayref} = shift;
}
elsif (ref $_[0] eq 'GLOB') {
$option{file} = shift;
}
elsif (ref $_[0]) {
croak "data source is not a reference type we understand";
}
# Last chance, if the first arg isn't one of the known keys, we
# assume it is a file name.
elsif (not _known_keys()->{$_[0]} ) {
$option{file} = shift;
}
else {
# Should be a known key. Nothing to do.
}
# Now, check to see if the next arg is also a reference.
my $data;
if (ref $_[0]) {
$data = shift;
$data = [$data] unless ref $data eq 'ARRAY';
for my $source (@$data) {
if (ref $source eq 'HASH') {
push @{ $option{fdat} }, $source;
}
elsif (ref $source) {
if ($source->can('param')) {
push @{ $option{fobject} }, $source;
}
else {
croak "data source $source does not supply a param method";
}
}
elsif (defined $source) {
croak "data source $source is not a hash or object reference";
}
}
}
# load in the rest of the options
%option = (%option, @_);
# As suggested in the docs, merge multiple fdats into one.
if (ref $option{fdat} eq 'ARRAY') {
my %merged;
for my $hash (@{ $option{fdat} }) {
for my $key (keys %$hash) {
$merged{$key} = $hash->{$key};
}
}
$option{'fdat'} = \%merged;
}
my %ignore_fields;
%ignore_fields = map { $_ => 1 } ( ref $option{'ignore_fields'} eq 'ARRAY' )
? @{ $option{ignore_fields} } : $option{ignore_fields} if exists( $option{ignore_fields} );
$self->{ignore_fields} = \%ignore_fields;
my %disable_fields;
%disable_fields = map { $_ => 1 } ( ref $option{'disable_fields'} eq 'ARRAY' )
? @{ $option{disable_fields} } : $option{disable_fields} if exists( $option{disable_fields} );
$self->{disable_fields} = \%disable_fields;
if (my $fdat = $option{fdat}){
# Copy the structure to prevent side-effects.
my %copy;
keys %$fdat; # reset fdat if each or Dumper was called on fdat
while(my($key, $val) = each %$fdat) {
next if exists $ignore_fields{$key};
$copy{ $key } = ref $val eq 'ARRAY' ? [ @$val ] : $val;
}
$self->{fdat} = \%copy;
}
# We want the reference to these objects to go out of scope at the
# end of the method.
local $self->{objects} = [];
if(my $objects = $option{fobject}){
unless(ref($objects) eq 'ARRAY'){
$objects = [ $objects ];
}
for my $object (@$objects){
# make sure objects in 'param_object' parameter support param()
defined($object->can('param')) or
croak("HTML::FillInForm->fill called with fobject option, containing object of type " . ref($object) . " which lacks a param() method!");
}
$self->{objects} = $objects;
}
if (my $target = $option{target}){
$self->{'target'} = $target;
}
if (defined($option{fill_password})){
$self->{fill_password} = $option{fill_password};
} else {
$self->{fill_password} = 1;
}
# make sure method has data to fill in HTML form with!
unless(exists $self->{fdat} || $self->{objects}){
croak("HTML::FillInForm->fillInForm() called without 'fobject' or 'fdat' parameter set");
}
local $self->{object_param_cache};
if(my $file = $option{file}){
$self->parse_file($file);
} elsif (my $scalarref = $option{scalarref}){
$self->parse($$scalarref);
} elsif (my $arrayref = $option{arrayref}){
for (@$arrayref){
$self->parse($_);
}
}
$self->eof;
return delete $self->{output};
}
# handles opening HTML tags such as <input ...>
sub start {
my ($self, $tagname, $attr, $attrseq, $origtext) = @_;
# set the current form
if ($tagname eq 'form') {
$self->{object_param_cache} = {};
if (exists $attr->{'name'} || exists $attr->{'id'}) {
$self->{'current_form'} = $attr->{'name'} || $attr->{'id'};
} else {
# in case of previous one without </FORM>
delete $self->{'current_form'};
}
}
# This form is not my target.
if (exists $self->{'target'} &&
(! exists $self->{'current_form'} ||
$self->{'current_form'} ne $self->{'target'})) {
$self->{'output'} .= $origtext;
return;
}
# HTML::Parser converts tagname to lowercase, so we don't need /i
if ($self->{option_no_value}) {
$self->{output} .= '>';
delete $self->{option_no_value};
}
# Check if we need to disable this field
$attr->{disable} = 1
if exists $attr->{'name'} and
exists $self->{disable_fields}{ $attr->{'name'} } and
$self->{disable_fields}{ $attr->{'name'} } and
not ( exists $attr->{disable} and $attr->{disable} );
if ($tagname eq 'input'){
my $value = exists $attr->{'name'} ? $self->_get_param($attr->{'name'}) : undef;
# force hidden fields to have a value
$value = '' if exists($attr->{'type'}) && $attr->{'type'} eq 'hidden' && ! exists $attr->{'value'} && ! defined $value;
if (defined($value)){
$value = $self->escapeHTMLStringOrList($value);
# check for input type, noting that default type is text
if (!exists $attr->{'type'} ||
$attr->{'type'} =~ /^(text|textfield|hidden|)$/i){
if ( ref($value) eq 'ARRAY' ) {
$value = shift @$value;
$value = '' unless defined $value;
}
$attr->{'value'} = $value;
} elsif (lc $attr->{'type'} eq 'password' && $self->{fill_password}) {
if ( ref($value) eq 'ARRAY' ) {
$value = shift @$value;
$value = '' unless defined $value;
}
$attr->{'value'} = $value;
} elsif (lc $attr->{'type'} eq 'radio'){
if ( ref($value) eq 'ARRAY' ) {
$value = $value->[0];
$value = '' unless defined $value;
}
# value for radio boxes default to 'on', works with netscape
$attr->{'value'} = 'on' unless exists $attr->{'value'};
if ($attr->{'value'} eq $value){
$attr->{'checked'} = 'checked';
} else {
delete $attr->{'checked'};
}
} elsif (lc $attr->{'type'} eq 'checkbox'){
# value for checkboxes default to 'on', works with netscape
$attr->{'value'} = 'on' unless exists $attr->{'value'};
delete $attr->{'checked'}; # Everything is unchecked to start
$value = [ $value ] unless ref($value) eq 'ARRAY';
foreach my $v ( @$value ) {
if ( $attr->{'value'} eq $v ) {
$attr->{'checked'} = 'checked';
}
}
# } else {
# warn(qq(Input field of unknown type "$attr->{type}": $origtext));
}
}
$self->{output} .= "<$tagname";
while (my ($key, $value) = each %$attr) {
next if $key eq '/';
$self->{output} .= sprintf qq( %s="%s"), $key, $value;
}
# extra space put here to work around Opera 6.01/6.02 bug
$self->{output} .= ' /' if $attr->{'/'};
$self->{output} .= ">";
} elsif ($tagname eq 'option'){
my $value = $self->_get_param($self->{selectName});
$value = [ $value ] unless ( ref($value) eq 'ARRAY' );
if ( defined $value->[0] ){
$value = $self->escapeHTMLStringOrList($value);
delete $attr->{selected} if exists $attr->{selected};
if(defined($attr->{'value'})){
# option tag has value attr - <OPTION VALUE="foo">bar</OPTION>
if ($self->{selectMultiple}){
# check if the option tag belongs to a multiple option select
foreach my $v ( grep { defined } @$value ) {
if ( $attr->{'value'} eq $v ){
$attr->{selected} = 'selected';
}
}
} else {
# if not every value of a fdat ARRAY belongs to a different select tag
if (not $self->{selectSelected}){
if ( $attr->{'value'} eq $value->[0]){
shift @$value if ref($value) eq 'ARRAY';
$attr->{selected} = 'selected';
$self->{selectSelected} = 1; # remeber that an option tag is selected for this select tag
}
}
}
} else {
# option tag has no value attr - <OPTION>bar</OPTION>
# save for processing under text handler
$self->{option_no_value} = $value;
}
}
$self->{output} .= "<$tagname";
while (my ($key, $value) = each %$attr) {
$self->{output} .= sprintf qq( %s="%s"), $key, $value;
}
unless ($self->{option_no_value}){
# we can close option tag here
$self->{output} .= ">";
}
} elsif ($tagname eq 'textarea'){
if ($attr->{'name'} and defined (my $value = $self->_get_param($attr->{'name'}))){
$value = $self->escapeHTMLStringOrList($value);
$value = (shift @$value || '') if ref($value) eq 'ARRAY';
# <textarea> foobar </textarea> -> <textarea> $value </textarea>
# we need to set outputText to 'no' so that 'foobar' won't be printed
$self->{outputText} = 'no';
$self->{output} .= $origtext . $value;
} else {
$self->{output} .= $origtext;
}
} elsif ($tagname eq 'select'){
$self->{selectName} = $attr->{'name'};
if (defined $attr->{'multiple'}){
$self->{selectMultiple} = 1; # helper var to remember if the select tag has the multiple attr set or not
} else {
$self->{selectMultiple} = 0;
$self->{selectSelected} = 0; # helper var to remember if an option was already selected in the current select tag
}
$self->{output} .= $origtext;
} else {
$self->{output} .= $origtext;
}
}
sub _get_param {
my ($self, $param) = @_;
return undef if $self->{ignore_fields}{$param};
return $self->{fdat}{$param} if exists $self->{fdat}{$param};
return $self->{object_param_cache}{$param} if exists $self->{object_param_cache}{$param};
# traverse the list in reverse order for backwards compatibility
# with the previous implementation.
for my $o (reverse @{$self->{objects}}) {
my @v = $o->param($param);
next unless @v;
return $self->{object_param_cache}{$param} = @v > 1 ? \@v : $v[0];
}
return undef;
}
# handles non-html text
sub text {
my ($self, $origtext) = @_;
# just output text, unless replaced value of <textarea> tag
unless(exists $self->{outputText} && $self->{outputText} eq 'no'){
if(exists $self->{option_no_value}){
# dealing with option tag with no value - <OPTION>bar</OPTION>
my $values = $self->{option_no_value};
my $value = $origtext;
$value =~ s/^\s+//;
$value =~ s/\s+$//;
foreach my $v ( @$values ) {
if ( $value eq $v ) {
$self->{output} .= ' selected="selected"';
}
}
# close <OPTION> tag
$self->{output} .= ">$origtext";
delete $self->{option_no_value};
} else {
$self->{output} .= $origtext;
}
}
}
# handles closing HTML tags such as </textarea>
sub end {
my ($self, $tagname, $origtext) = @_;
if ($self->{option_no_value}) {
$self->{output} .= '>';
delete $self->{option_no_value};
}
if($tagname eq 'select'){
delete $self->{selectName};
} elsif ($tagname eq 'textarea'){
delete $self->{outputText};
} elsif ($tagname eq 'form') {
delete $self->{'current_form'};
}
$self->{output} .= $origtext;
}
sub escapeHTMLStringOrList {
my ($self, $toencode) = @_;
if (ref($toencode) eq 'ARRAY') {
foreach my $elem (@$toencode) {
$elem = $self->escapeHTML($elem);
}
return $toencode;
} else {
return $self->escapeHTML($toencode);
}
}
sub escapeHTML {
my ($self, $toencode) = @_;
return undef unless defined($toencode);
$toencode =~ s/&/&amp;/g;
$toencode =~ s/\"/&quot;/g;
$toencode =~ s/>/&gt;/g;
$toencode =~ s/</&lt;/g;
return $toencode;
}
sub comment {
my ( $self, $text ) = @_;
# if it begins with '[if ' and doesn't end with '<![endif]'
# it's a "downlevel-revealed" conditional comment (stupid IE)
# or
# if it ends with '[endif]' then it's the end of a
# "downlevel-revealed" conditional comment
if(
(
( index($text, '[if ') == 0 )
&&
( $text !~ /<!\[endif\]$/ )
)
||
( $text eq '[endif]' )
) {
$self->{output} .= '<!' . $text . '>';
} else {
$self->{output} .= '<!--' . $text . '-->';
}
}
sub process {
my ( $self, $token0, $text ) = @_;
$self->{output} .= $text;
}
sub declaration {
my ( $self, $text ) = @_;
$self->{output} .= '<!' . $text . '>';
}
1;
__END__
=head1 NAME
HTML::FillInForm - Populates HTML Forms with data.
=head1 DESCRIPTION
This module fills in an HTML form with data from a Perl data structure, allowing you
to keep the HTML and Perl separate.
Here are two common use cases:
1. A user submits an HTML form without filling out a required field. You want
to redisplay the form with all the previous data in it, to make it easy for the
user to see and correct the error.
2. You have just retrieved a record from a database and need to display it in
an HTML form.
=head1 SYNOPSIS
Fill HTML form with data.
$output = HTML::FillInForm->fill( \$html, $q );
$output = HTML::FillInForm->fill( \@html, [$q1,$q2] );
$output = HTML::FillInForm->fill( \*HTML, \%data );
$output = HTML::FillInForm->fill( 't.html', [\%data1,%data2] );
The HTML can be provided as a scalarref, arrayref, filehandle or file. The data can come from one or more
hashrefs, or objects which support a param() method, like CGI.pm, L<Apache::Request|Apache::Request>, etc.
=head1 fill
The basic syntax is seen above the Synopsis. There are a few additional options.
=head2 Options
=head3 target => 'form1'
Suppose you have multiple forms in a html file and only want to fill in one.
$output = HTML::FillInForm->fill(\$html, $q, target => 'form1');
This will fill in only the form inside
<FORM name="form1"> ... </FORM>
=head3 fill_password => 0
Passwords are filled in by default. To disable:
fill_password => 0
=head3 ignore_fields => []
To disable the filling of some fields:
ignore_fields => ['prev','next']
=head3 disable_fields => []
To disable fields from being edited:
disable_fields => [ 'uid', 'gid' ]
=head2 File Upload fields
File upload fields cannot be supported directly. Workarounds include asking the
user to re-attach any file uploads or fancy server-side storage and
referencing. You are on your own.
=head2 Clearing Fields
Fields are cleared if you set their value to an empty string or empty arrayref but not undef:
# this will leave the form element foo untouched
HTML::FillInForm->fill(\$html, { foo => undef });
# this will set clear the form element foo
HTML::FillInForm->fill(\$html, { foo => "" });
It has been suggested to add a option to change the behavior so that undef
values will clear the form elements. Patches welcome.
=head1 Old syntax
You probably need to read no further. The remaining docs concern the
1.x era syntax, which is still supported.
=head2 new
Call C<new()> to create a new FillInForm object:
$fif = HTML::FillInForm->new;
$fif->fill(...);
In theory, there is a slight performance benefit to calling C<new()> before C<fill()> if you make multiple
calls to C<fill()> before you destroy the object. Benchmark before optimizing.
=head2 fill ( old syntax )
Instead of having your HTML and data types auto-detected, you can declare them explicitly in your
call to C<fill()>:
HTML source options:
arrayref => @html
scalarref => $html
file => \*HTML
file => 't.html'
Fill Data options:
fobject => $data_obj # with param() method
fdat => \%data
Additional methods are also available:
fill_file(\*HTML,...);
fill_file('t.html',...);
fill_arrayref(\@html,...);
fill_scalarref(\$html,...);
=head1 CALLING FROM OTHER MODULES
=head2 Apache::PageKit
To use HTML::FillInForm in L<Apache::PageKit> is easy. It is
automatically called for any page that includes a <form> tag.
It can be turned on or off by using the C<fill_in_form> configuration
option.
=head2 Apache::ASP v2.09 and above
HTML::FillInForm is now integrated with Apache::ASP. To activate, use
PerlSetVar FormFill 1
$Response->{FormFill} = 1
=head2 HTML::Mason
Using HTML::FillInForm from HTML::Mason is covered in the FAQ on
the masonhq.com website at
L<http://www.masonhq.com/?FAQ:HTTPAndHTML#h-how_can_i_populate_form_values_automatically_>
=head1 VERSION
This documentation describes HTML::FillInForm module version 2.00
=head1 SECURITY
Note that you might want to think about caching issues if you have password
fields on your page. There is a discussion of this issue at
http://www.perlmonks.org/index.pl?node_id=70482
In summary, some browsers will cache the output of CGI scripts, and you
can control this by setting the Expires header. For example, use
C<-expires> in L<CGI.pm> or set C<browser_cache> to I<no> in
Config.xml file of L<Apache::PageKit>.
=head1 TRANSLATION
Kato Atsushi has translated these docs into Japanese, available from
http://perldoc.jp
=head1 BUGS
Please submit any bug reports to tjmather@maxmind.com.
=head1 NOTES
Requires Perl 5.005 and L<HTML::Parser> version 3.26.
I wrote this module because I wanted to be able to insert CGI data
into HTML forms,
but without combining the HTML and Perl code. CGI.pm and Embperl allow you so
insert CGI data into forms, but require that you mix HTML with Perl.
There is a nice review of the module available here:
L<http://www.perlmonks.org/index.pl?node_id=274534>
=head1 AUTHOR
(c) 2005 TJ Mather, tjmather@maxmind.com, L<http://www.maxmind.com/>
All rights reserved. This package is free software; you can
redistribute it and/or modify it under the same terms as Perl itself.
=head1 SEE ALSO
L<HTML::Parser|HTML::Parser>,
L<Data::FormValidator|Data::FormValidato>,
L<HTML::Template|HTML::Template>,
L<Apache::PageKit|Apache::PageKit>
=head1 CREDITS
Fixes, Bug Reports, Docs have been generously provided by:
Tatsuhiko Miyagawa Joseph Yanni
Boris Zentner Philip Mak
Dave Rolsky Jost Krieger
Patrick Michael Kane Gabriel Burka
Ade Olonoh Bill Moseley
Tom Lancaster James Tolley
Martin H Sluka Dan Kubb
Mark Stosberg Alexander Hartmaier
Jonathan Swartz Paul Miller
Trevor Schellhorn Anthony Ettinger
Jim Miner Simon P. Ditner
Paul Lindner Michael Peters
Maurice Aubrey Trevor Schellhorn
Andrew Creer
Thanks!

View File

@@ -0,0 +1,112 @@
package HTML::Filter;
use strict;
use vars qw(@ISA $VERSION);
require HTML::Parser;
@ISA=qw(HTML::Parser);
$VERSION = "3.57";
sub declaration { $_[0]->output("<!$_[1]>") }
sub process { $_[0]->output($_[2]) }
sub comment { $_[0]->output("<!--$_[1]-->") }
sub start { $_[0]->output($_[4]) }
sub end { $_[0]->output($_[2]) }
sub text { $_[0]->output($_[1]) }
sub output { print $_[1] }
1;
__END__
=head1 NAME
HTML::Filter - Filter HTML text through the parser
=head1 NOTE
B<This module is deprecated.> The C<HTML::Parser> now provides the
functionally of C<HTML::Filter> much more efficiently with the the
C<default> handler.
=head1 SYNOPSIS
require HTML::Filter;
$p = HTML::Filter->new->parse_file("index.html");
=head1 DESCRIPTION
C<HTML::Filter> is an HTML parser that by default prints the
original text of each HTML element (a slow version of cat(1) basically).
The callback methods may be overridden to modify the filtering for some
HTML elements and you can override output() method which is called to
print the HTML text.
C<HTML::Filter> is a subclass of C<HTML::Parser>. This means that
the document should be given to the parser by calling the $p->parse()
or $p->parse_file() methods.
=head1 EXAMPLES
The first example is a filter that will remove all comments from an
HTML file. This is achieved by simply overriding the comment method
to do nothing.
package CommentStripper;
require HTML::Filter;
@ISA=qw(HTML::Filter);
sub comment { } # ignore comments
The second example shows a filter that will remove any E<lt>TABLE>s
found in the HTML file. We specialize the start() and end() methods
to count table tags and then make output not happen when inside a
table.
package TableStripper;
require HTML::Filter;
@ISA=qw(HTML::Filter);
sub start
{
my $self = shift;
$self->{table_seen}++ if $_[0] eq "table";
$self->SUPER::start(@_);
}
sub end
{
my $self = shift;
$self->SUPER::end(@_);
$self->{table_seen}-- if $_[0] eq "table";
}
sub output
{
my $self = shift;
unless ($self->{table_seen}) {
$self->SUPER::output(@_);
}
}
If you want to collect the parsed text internally you might want to do
something like this:
package FilterIntoString;
require HTML::Filter;
@ISA=qw(HTML::Filter);
sub output { push(@{$_[0]->{fhtml}}, $_[1]) }
sub filtered_html { join("", @{$_[0]->{fhtml}}) }
=head1 SEE ALSO
L<HTML::Parser>
=head1 COPYRIGHT
Copyright 1997-1999 Gisle Aas.
This library is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.
=cut

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,301 @@
package HTML::HeadParser;
=head1 NAME
HTML::HeadParser - Parse <HEAD> section of a HTML document
=head1 SYNOPSIS
require HTML::HeadParser;
$p = HTML::HeadParser->new;
$p->parse($text) and print "not finished";
$p->header('Title') # to access <title>....</title>
$p->header('Content-Base') # to access <base href="http://...">
$p->header('Foo') # to access <meta http-equiv="Foo" content="...">
$p->header('X-Meta-Author') # to access <meta name="author" content="...">
$p->header('X-Meta-Charset') # to access <meta charset="...">
=head1 DESCRIPTION
The C<HTML::HeadParser> is a specialized (and lightweight)
C<HTML::Parser> that will only parse the E<lt>HEAD>...E<lt>/HEAD>
section of an HTML document. The parse() method
will return a FALSE value as soon as some E<lt>BODY> element or body
text are found, and should not be called again after this.
Note that the C<HTML::HeadParser> might get confused if raw undecoded
UTF-8 is passed to the parse() method. Make sure the strings are
properly decoded before passing them on.
The C<HTML::HeadParser> keeps a reference to a header object, and the
parser will update this header object as the various elements of the
E<lt>HEAD> section of the HTML document are recognized. The following
header fields are affected:
=over 4
=item Content-Base:
The I<Content-Base> header is initialized from the E<lt>base
href="..."> element.
=item Title:
The I<Title> header is initialized from the E<lt>title>...E<lt>/title>
element.
=item Isindex:
The I<Isindex> header will be added if there is a E<lt>isindex>
element in the E<lt>head>. The header value is initialized from the
I<prompt> attribute if it is present. If no I<prompt> attribute is
given it will have '?' as the value.
=item X-Meta-Foo:
All E<lt>meta> elements containing a C<name> attribute will result in
headers using the prefix C<X-Meta-> appended with the value of the
C<name> attribute as the name of the header, and the value of the
C<content> attribute as the pushed header value.
E<lt>meta> elements containing a C<http-equiv> attribute will result
in headers as in above, but without the C<X-Meta-> prefix in the
header name.
E<lt>meta> elements containing a C<charset> attribute will result in
an C<X-Meta-Charset> header, using the value of the C<charset>
attribute as the pushed header value.
=back
=head1 METHODS
The following methods (in addition to those provided by the
superclass) are available:
=over 4
=cut
require HTML::Parser;
@ISA = qw(HTML::Parser);
use HTML::Entities ();
use strict;
use vars qw($VERSION $DEBUG);
#$DEBUG = 1;
$VERSION = "3.62";
=item $hp = HTML::HeadParser->new
=item $hp = HTML::HeadParser->new( $header )
The object constructor. The optional $header argument should be a
reference to an object that implement the header() and push_header()
methods as defined by the C<HTTP::Headers> class. Normally it will be
of some class that is a or delegates to the C<HTTP::Headers> class.
If no $header is given C<HTML::HeadParser> will create an
C<HTTP::Headers> object by itself (initially empty).
=cut
sub new
{
my($class, $header) = @_;
unless ($header) {
require HTTP::Headers;
$header = HTTP::Headers->new;
}
my $self = $class->SUPER::new(api_version => 3,
start_h => ["start", "self,tagname,attr"],
end_h => ["end", "self,tagname"],
text_h => ["text", "self,text"],
ignore_elements => [qw(script style)],
);
$self->{'header'} = $header;
$self->{'tag'} = ''; # name of active element that takes textual content
$self->{'text'} = ''; # the accumulated text associated with the element
$self;
}
=item $hp->header;
Returns a reference to the header object.
=item $hp->header( $key )
Returns a header value. It is just a shorter way to write
C<$hp-E<gt>header-E<gt>header($key)>.
=cut
sub header
{
my $self = shift;
return $self->{'header'} unless @_;
$self->{'header'}->header(@_);
}
sub as_string # legacy
{
my $self = shift;
$self->{'header'}->as_string;
}
sub flush_text # internal
{
my $self = shift;
my $tag = $self->{'tag'};
my $text = $self->{'text'};
$text =~ s/^\s+//;
$text =~ s/\s+$//;
$text =~ s/\s+/ /g;
print "FLUSH $tag => '$text'\n" if $DEBUG;
if ($tag eq 'title') {
HTML::Entities::decode($text);
$self->{'header'}->push_header(Title => $text);
}
$self->{'tag'} = $self->{'text'} = '';
}
# This is an quote from the HTML3.2 DTD which shows which elements
# that might be present in a <HEAD>...</HEAD>. Also note that the
# <HEAD> tags themselves might be missing:
#
# <!ENTITY % head.content "TITLE & ISINDEX? & BASE? & STYLE? &
# SCRIPT* & META* & LINK*">
#
# <!ELEMENT HEAD O O (%head.content)>
#
# From HTML 4.01:
#
# <!ENTITY % head.misc "SCRIPT|STYLE|META|LINK|OBJECT">
# <!ENTITY % head.content "TITLE & BASE?">
# <!ELEMENT HEAD O O (%head.content;) +(%head.misc;)>
#
# Added in HTML 5 as of WD-html5-20090423: noscript, command
sub start
{
my($self, $tag, $attr) = @_; # $attr is reference to a HASH
print "START[$tag]\n" if $DEBUG;
$self->flush_text if $self->{'tag'};
if ($tag eq 'meta') {
my $key = $attr->{'http-equiv'};
if (!defined($key) || !length($key)) {
if ($attr->{name}) {
$key = "X-Meta-\u$attr->{name}";
} elsif ($attr->{charset}) { # HTML 5 <meta charset="...">
$key = "X-Meta-Charset";
$self->{header}->push_header($key => $attr->{charset});
return;
} else {
return;
}
}
$self->{'header'}->push_header($key => $attr->{content});
} elsif ($tag eq 'base') {
return unless exists $attr->{href};
$self->{'header'}->push_header('Content-Base' => $attr->{href});
} elsif ($tag eq 'isindex') {
# This is a non-standard header. Perhaps we should just ignore
# this element
$self->{'header'}->push_header(Isindex => $attr->{prompt} || '?');
} elsif ($tag =~ /^(?:title|noscript|object|command)$/) {
# Just remember tag. Initialize header when we see the end tag.
$self->{'tag'} = $tag;
} elsif ($tag eq 'link') {
return unless exists $attr->{href};
# <link href="http:..." rel="xxx" rev="xxx" title="xxx">
my $h_val = "<" . delete($attr->{href}) . ">";
for (sort keys %{$attr}) {
next if $_ eq "/"; # XHTML junk
$h_val .= qq(; $_="$attr->{$_}");
}
$self->{'header'}->push_header(Link => $h_val);
} elsif ($tag eq 'head' || $tag eq 'html') {
# ignore
} else {
# stop parsing
$self->eof;
}
}
sub end
{
my($self, $tag) = @_;
print "END[$tag]\n" if $DEBUG;
$self->flush_text if $self->{'tag'};
$self->eof if $tag eq 'head';
}
sub text
{
my($self, $text) = @_;
print "TEXT[$text]\n" if $DEBUG;
unless ($self->{first_chunk}) {
# drop Unicode BOM if found
if ($self->utf8_mode) {
$text =~ s/^\xEF\xBB\xBF//;
}
else {
$text =~ s/^\x{FEFF}//;
}
$self->{first_chunk}++;
}
my $tag = $self->{tag};
if (!$tag && $text =~ /\S/) {
# Normal text means start of body
$self->eof;
return;
}
return if $tag ne 'title';
$self->{'text'} .= $text;
}
BEGIN {
*utf8_mode = sub { 1 } unless HTML::Entities::UNICODE_SUPPORT;;
}
1;
__END__
=back
=head1 EXAMPLE
$h = HTTP::Headers->new;
$p = HTML::HeadParser->new($h);
$p->parse(<<EOT);
<title>Stupid example</title>
<base href="http://www.linpro.no/lwp/">
Normal text starts here.
EOT
undef $p;
print $h->title; # should print "Stupid example"
=head1 SEE ALSO
L<HTML::Parser>, L<HTTP::Headers>
The C<HTTP::Headers> class is distributed as part of the
I<libwww-perl> package. If you don't have that distribution installed
you need to provide the $header argument to the C<HTML::HeadParser>
constructor with your own object that implements the documented
protocol.
=head1 COPYRIGHT
Copyright 1996-2001 Gisle Aas. All rights reserved.
This library is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.
=cut

View File

@@ -0,0 +1,185 @@
package HTML::LinkExtor;
require HTML::Parser;
@ISA = qw(HTML::Parser);
$VERSION = "3.60";
=head1 NAME
HTML::LinkExtor - Extract links from an HTML document
=head1 SYNOPSIS
require HTML::LinkExtor;
$p = HTML::LinkExtor->new(\&cb, "http://www.perl.org/");
sub cb {
my($tag, %links) = @_;
print "$tag @{[%links]}\n";
}
$p->parse_file("index.html");
=head1 DESCRIPTION
I<HTML::LinkExtor> is an HTML parser that extracts links from an
HTML document. The I<HTML::LinkExtor> is a subclass of
I<HTML::Parser>. This means that the document should be given to the
parser by calling the $p->parse() or $p->parse_file() methods.
=cut
use strict;
use HTML::Tagset ();
# legacy (some applications grabs this hash directly)
use vars qw(%LINK_ELEMENT);
*LINK_ELEMENT = \%HTML::Tagset::linkElements;
=over 4
=item $p = HTML::LinkExtor->new
=item $p = HTML::LinkExtor->new( $callback )
=item $p = HTML::LinkExtor->new( $callback, $base )
The constructor takes two optional arguments. The first is a reference
to a callback routine. It will be called as links are found. If a
callback is not provided, then links are just accumulated internally
and can be retrieved by calling the $p->links() method.
The $base argument is an optional base URL used to absolutize all URLs found.
You need to have the I<URI> module installed if you provide $base.
The callback is called with the lowercase tag name as first argument,
and then all link attributes as separate key/value pairs. All
non-link attributes are removed.
=cut
sub new
{
my($class, $cb, $base) = @_;
my $self = $class->SUPER::new(
start_h => ["_start_tag", "self,tagname,attr"],
report_tags => [keys %HTML::Tagset::linkElements],
);
$self->{extractlink_cb} = $cb;
if ($base) {
require URI;
$self->{extractlink_base} = URI->new($base);
}
$self;
}
sub _start_tag
{
my($self, $tag, $attr) = @_;
my $base = $self->{extractlink_base};
my $links = $HTML::Tagset::linkElements{$tag};
$links = [$links] unless ref $links;
my @links;
my $a;
for $a (@$links) {
next unless exists $attr->{$a};
push(@links, $a, $base ? URI->new($attr->{$a}, $base)->abs($base)
: $attr->{$a});
}
return unless @links;
$self->_found_link($tag, @links);
}
sub _found_link
{
my $self = shift;
my $cb = $self->{extractlink_cb};
if ($cb) {
&$cb(@_);
} else {
push(@{$self->{'links'}}, [@_]);
}
}
=item $p->links
Returns a list of all links found in the document. The returned
values will be anonymous arrays with the following elements:
[$tag, $attr => $url1, $attr2 => $url2,...]
The $p->links method will also truncate the internal link list. This
means that if the method is called twice without any parsing
between them the second call will return an empty list.
Also note that $p->links will always be empty if a callback routine
was provided when the I<HTML::LinkExtor> was created.
=cut
sub links
{
my $self = shift;
exists($self->{'links'}) ? @{delete $self->{'links'}} : ();
}
# We override the parse_file() method so that we can clear the links
# before we start a new file.
sub parse_file
{
my $self = shift;
delete $self->{'links'};
$self->SUPER::parse_file(@_);
}
=back
=head1 EXAMPLE
This is an example showing how you can extract links from a document
received using LWP:
use LWP::UserAgent;
use HTML::LinkExtor;
use URI::URL;
$url = "http://www.perl.org/"; # for instance
$ua = LWP::UserAgent->new;
# Set up a callback that collect image links
my @imgs = ();
sub callback {
my($tag, %attr) = @_;
return if $tag ne 'img'; # we only look closer at <img ...>
push(@imgs, values %attr);
}
# Make the parser. Unfortunately, we don't know the base yet
# (it might be different from $url)
$p = HTML::LinkExtor->new(\&callback);
# Request document and parse it as it arrives
$res = $ua->request(HTTP::Request->new(GET => $url),
sub {$p->parse($_[0])});
# Expand all image URLs to absolute ones
my $base = $res->base;
@imgs = map { $_ = url($_, $base)->abs; } @imgs;
# Print them out
print join("\n", @imgs), "\n";
=head1 SEE ALSO
L<HTML::Parser>, L<HTML::Tagset>, L<LWP>, L<URI::URL>
=head1 COPYRIGHT
Copyright 1996-2001 Gisle Aas.
This library is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.
=cut
1;

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,213 @@
package HTML::Perlinfo;
BEGIN { %Seen = %INC }
use strict;
use warnings;
use Carp ();
use HTML::Perlinfo::Apache;
use HTML::Perlinfo::Modules;
use HTML::Perlinfo::Common;
use base qw(Exporter HTML::Perlinfo::Base);
our @EXPORT = qw(perlinfo);
our $VERSION = '1.61';
sub perlinfo {
my ($opt) = @_;
$opt = 'INFO_ALL' unless $opt;
error_msg("Invalid perlinfo() parameter: @_")
if (($opt !~ /^INFO_(?:ALL|GENERAL|CONFIG|VARIABLES|APACHE|MODULES|LICENSE|LOADED)$/) || @_ > 1);
$opt = lc $opt;
my $p = HTML::Perlinfo->new();
$p->$opt;
}
%INC = %HTML::Perlinfo::Seen;
1;
__END__
=pod
=head1 NAME
HTML::Perlinfo - Display a lot of Perl information in HTML format
=head1 SYNOPSIS
use HTML::Perlinfo;
perlinfo();
use HTML::Perlinfo;
use CGI qw(header);
$|++;
print header;
perlinfo(INFO_MODULES);
=head1 DESCRIPTION
This module outputs a large amount of information about your Perl installation in HTML. So far, this includes information about Perl compilation options, the Perl version, server information and environment, HTTP headers, OS version information, Perl modules, and more.
HTML::Perlinfo is aimed at Web developers, but almost anyone using Perl may find it useful. It is a valuable debugging tool as it contains all EGPCS (Environment, GET, POST, Cookie, Server) data. It will also work under taint mode.
The output may be customized by passing one of the following options.
=head1 OPTIONS
There are 8 options to pass to the perlinfo funtion. All of these options are also object methods. The key difference is their case: Captilize the option name when passing it to the function and use only lower-case letters when using the object-oriented approach.
=over
=item INFO_GENERAL
The Perl version, build date, and more.
=item INFO_VARIABLES
Shows all predefined variables from EGPCS (Environment, GET, POST, Cookie, Server).
=item INFO_CONFIG
All configuration values from config_sh. INFO_ALL shows only some values.
=item INFO_APACHE
Apache HTTP server information, including mod_perl information.
=item INFO_MODULES
All installed modules, their version number and description. INFO_ALL shows only core modules.
Please also see L<HTML::Perlinfo::Modules>.
=item INFO_LOADED
Post-execution dump of loaded modules (plus INFO_VARIABLES). INFO_ALL shows only core modules. Please also see L<HTML::Perlinfo::Loaded>.
=item INFO_LICENSE
Perl license information.
=item INFO_ALL
Shows all of the above defaults. This is the default value.
=back
=head1 PROGRAMMING STYLE
There are two styles of programming with Perlinfo.pm, a function-oriented style and an object-oriented style.
Function-oriented style:
# Show all information, defaults to INFO_ALL
perlinfo();
# Show only module information. This shows all installed modules.
perlinfo(INFO_MODULES);
Object-oriented style:
$p = new HTML::Perlinfo;
$p->info_all;
# You can also set the CSS values in the constructor!
$p = HTML::Perlinfo->new(
bg_image => 'http://i104.photobucket.com/albums/m176/perlinfo/camel.gif',
bg_repeat => 'yes-repeat'
);
$p->info_all;
More examples ...
# This is wrong (no capitals)
$p->INFO_MODULES;
# But this is correct
perlinfo(INFO_MODULES);
# Ditto
$p->info_modules;
=head1 CUSTOMIZING THE HTML
You can capture the HTML output and manipulate it or you can alter CSS elements with object attributes.
For further details and examples, please see the L<HTML documentation|HTML::Perlinfo::HTML> in the HTML::Perlinfo distribution.
=head1 SECURITY
Displaying detailed server information on the internet is not a good idea and HTML::Perlinfo reveals a lot of information about the local environment. While restricting what system users can publish online is wise, you can also hinder them from using the module by installing it outside of the usual module directories (see perldoc -q lib). Of course, preventing users from installing the module in their own home directories is another matter entirely.
=head1 REQUIREMENTS
HTML::Perlinfo does not require any non-core modules.
=head1 NOTES
INFO_APACHE relies soley on environment variables.
INFO_VARIABLES did not work correctly until version 1.52.
INFO_LOADED is the only option whose output cannot be assigned to a scalar.
Some might notice that HTML::Perlinfo shares the look and feel of the PHP function phpinfo. It was originally inspired by that function and was first released in 2004 as PHP::Perlinfo, which is no longer available on CPAN.
Since the module outputs HTML, you may want to use it in a CGI script, but you do not have to. Of course, some information, like HTTP headers, would not be available if you use the module at the command-line. If you decide to use this module in a CGI script, B<make sure you print out the content-type header beforehand>. For example:
use HTML::Perlinfo;
print "Content-type: text/html\n\n";
perlinfo();
I prefer to use the header function from the CGI module:
use HTML::Perlinfo;
use CGI qw(header);
print header;
perlinfo();
In this example, I am flushing the buffer because I know that there will be a lot of modules:
use HTML::Perlinfo;
use CGI qw(header);
$|++;
print header;
perlinfo(INFO_MODULES);
HTML::Perlinfo stopped printing the header automatically as of version 1.43.
=head1 BUGS
Please report any bugs or feature requests to C<bug-html-perlinfo@rt.cpan.org>, or through the web interface at
L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=HTML-Perlinfo>.
I will be notified, and then you'll automatically be notified of progress on your bug as I make changes.
=head1 SEE ALSO
L<Config>. You can also use "perl -V" to see a configuration summary at the command-line.
L<CGI::Carp::Fatals>, L<Apache::Status>, L<App::Info>, L<Probe::Perl>, L<Module::CoreList>, L<Module::Info>, among others.
Also included in the Perlinfo distribution: L<perlinfo>, L<HTML::Perlinfo::Loaded>, L<HTML::Perlinfo::Modules>
=head1 AUTHOR
Mike Accardo <mikeaccardo@yahoo.com>
=head1 COPYRIGHT
Copyright (c) 2004-9, Mike Accardo. All Rights Reserved.
This module is free software. It may be used, redistributed
and/or modified under the terms of the Perl Artistic License.
=cut

View File

@@ -0,0 +1,100 @@
package HTML::Perlinfo::Apache;
use warnings;
use strict;
use HTML::Perlinfo::Common;
sub new {
my %apache;
my $env = ( $ENV{SERVER_SOFTWARE} || "" ) =~ /apache/i;
my $mp = exists $ENV{MOD_PERL};
$apache{'env'} = $env;
$apache{'mp'} = $mp;
bless \%apache;
}
sub has {
my ( $self, @opts ) = @_;
for my $opt (@opts) {
return 0 unless $self->{$opt};
}
return 1;
}
sub print_apache {
my $self = shift;
my @mods;
my ($version, $hostname, $port, $mp_status) = ("<i>Not detected</i>") x 7;
$mp_status = 'enabled' if $self->has qw(mp);
($version) = $ENV{'SERVER_SOFTWARE'} =~ /(\d+[\.\d]*)/
if (defined $ENV{'SERVER_SOFTWARE'} && $ENV{'SERVER_SOFTWARE'} =~ /\d+[\.\d]*/);
return join '', print_table_row(2, "Apache Version", "$version"),
(defined($ENV{'SERVER_NAME'}) && defined($ENV{'SERVER_PORT'})) ?
print_table_row(2, "Hostname:Port", "$ENV{'SERVER_NAME'} : $ENV{'SERVER_PORT'}"):
print_table_row(2, "Hostname:Port", "$hostname : $port"),
print_table_row(2, "mod_perl", "$mp_status");
}
sub print_modperl {
my ($version_status, $version_number) = ("<i>Not detected</i>") x 3;
$version_status = '1.0';
$version_status = '2.0 or higher' if $ENV{MOD_PERL_API_VERSION} && $ENV{MOD_PERL_API_VERSION} == 2;
($version_number) = $ENV{MOD_PERL} =~ /^\S+\/(\d+(?:[\.\_]\d+)+)/;
$version_number =~ s/_//g;
$version_number =~ s/(\.[^.]+)\./$1/g;
unless ($version_status eq '2.0 or higher') {
if ( $version_number >= 1.9901 ) {
$version_status = '1.9 which is incompatible with 2.0';
}
}
return join '', print_box_start(0),
"Running under mod_perl version $version_status (version number: $version_number)",
print_box_end();
}
sub print_apache_environment {
return join '', print_table_row(2, "DOCUMENT_ROOT", $ENV{'DOCUMENT_ROOT'}),
print_table_row(2, "HTTP_ACCEPT", $ENV{'HTTP_ACCEPT'}),
print_table_row(2, "HTTP_ACCEPT_CHARSET", $ENV{'HTTP_ACCEPT_CHARSET'}),
print_table_row(2, "HTTP_ACCEPT_ENCODING", $ENV{'HTTP_ACCEPT_ENCODING'}),
print_table_row(2, "HTTP_ACCEPT_LANGUAGE", $ENV{'HTTP_ACCEPT_LANGUAGE'}),
print_table_row(2, "HTTP_CONNECTION", $ENV{'HTTP_CONNECTION'}),
print_table_row(2, "HTTP_HOSTS", $ENV{'HTTP_HOSTS'}),
print_table_row(2, "HTTP_KEEP_ALIVE", $ENV{'HTTP_KEEP_ALIVE'}),
print_table_row(2, "HTTP_USER_AGENT", $ENV{'HTTP_USER_AGENT'}),
print_table_row(2, "PATH", $ENV{'PATH'}),
print_table_row(2, "REMOTE_ADDR", $ENV{'REMOTE_ADDR'}),
print_table_row(2, "REMOTE_HOST", $ENV{'REMOTE_HOST'}),
print_table_row(2, "REMOTE_PORT", $ENV{'REMOTE_PORT'}),
print_table_row(2, "SCRIPT_FILENAME", $ENV{'SCRIPT_FILENAME'}),
print_table_row(2, "SCRIPT_URI", $ENV{'SCRIPT_URI'}),
print_table_row(2, "SCRIPT_URL", $ENV{'SCRIPT_URL'}),
print_table_row(2, "SERVER_ADDR", $ENV{'SERVER_ADDR'}),
print_table_row(2, "SERVER_ADMIN", $ENV{'SERVER_ADMIN'}),
print_table_row(2, "SERVER_NAME", $ENV{'SERVER_NAME'}),
print_table_row(2, "SERVER_PORT", $ENV{'SERVER_PORT'}),
print_table_row(2, "SERVER_SIGNATURE", $ENV{'SERVER_SIGNATURE'}),
print_table_row(2, "SERVER_SOFTWARE", $ENV{'SERVER_SOFTWARE'}),
print_table_row(2, "GATEWAY_INTERFACE", $ENV{'GATEWAY_INTERFACE'}),
print_table_row(2, "SERVER_PROTOCOL", $ENV{'SERVER_PROTOCOL'}),
print_table_row(2, "REQUEST_METHOD", $ENV{'REQUEST_METHOD'}),
print_table_row(2, "QUERY_STRING", $ENV{'QUERY_STRING'}),
print_table_row(2, "REQUEST_URI", $ENV{'REQUEST_URI'}),
print_table_row(2, "SCRIPT_NAME", $ENV{'SCRIPT_NAME'});
}
1;

View File

@@ -0,0 +1,249 @@
package HTML::Perlinfo::Base;
use HTML::Perlinfo::Common;
use HTML::Perlinfo::General;
use Carp ();
use warnings;
use strict;
sub new {
my ($class, %params) = @_;
my $self = {};
$self->{full_page} = 1;
$self->{title} = 0;
$self->{bg_image} = '';
$self->{bg_position} = 'center';
$self->{bg_repeat} = 'no_repeat';
$self->{bg_attribute} = 'fixed';
$self->{bg_color} = '#ffffff';
$self->{ft_family} = 'sans-serif';
$self->{ft_color} = '#000000';
$self->{lk_color} = '#000099';
$self->{lk_decoration} = 'none';
$self->{lk_bgcolor} = '';
$self->{lk_hvdecoration} = 'underline';
$self->{header_bgcolor} = '#9999cc';
$self->{header_ftcolor} = '#000000';
$self->{leftcol_bgcolor} = '#ccccff';
$self->{leftcol_ftcolor} = '#000000';
$self->{rightcol_bgcolor} = '#cccccc';
$self->{rightcol_ftcolor} = '#000000';
foreach my $key (keys %params) {
if (exists $self->{$key}) {
$self->{$key} = $params{$key};
}
else {
error_msg("$key is an invalid attribute");
}
}
bless $self, $class;
return $self;
}
sub info_all {
my $self = shift;
my %param = @_;
error_msg("invalid parameter") if (defined $_[0] && exists $param{'links'} && ref $param{'links'} ne 'ARRAY');
$self->links(@{$param{'links'}}) if exists $param{'links'};
my $html;
$self->{title} = 'perlinfo(INFO_ALL)' unless $self->{title};
$html .= $self->print_htmlhead() if $self->{full_page};
$html .= print_general();
$html .= print_variables();
$html .= print_thesemodules('core');
$html .= print_license();
$html .= "</div></body></html>" if $self->{full_page};
defined wantarray ? return $html : print $html;
}
sub info_general {
my $self = shift;
my %param = @_;
error_msg("invalid parameter") if (defined $_[0] && exists $param{'links'} && ref $param{'links'} ne 'ARRAY');
$self->links(@{$param{'links'}}) if exists $param{'links'};
my $html;
$self->{title} = 'perlinfo(INFO_GENERAL)' unless $self->{title};
$html .= $self->print_htmlhead() if $self->{full_page};
$html .= print_general('top');
$html .= "</div></body></html>" if $self->{full_page};
defined wantarray ? return $html : print $html;
}
sub info_loaded {
my $self = shift;
$self->{'title'} = 'perlinfo(INFO_LOADED)' unless $self->{'title'};
my $html;
$html .= $self->print_htmlhead() if $self->{'full_page'};
eval qq{
END {
delete \$INC{'HTML/Perlinfo.pm'};
\$html .= print_thesemodules('loaded',[values %INC]);
\$html .= print_variables();
\$html .= '</div></body></html>' if \$self->{'full_page'};
print \$html;
}
}; die $@ if $@;
}
sub info_modules {
my $self = shift;
my %param = @_;
error_msg("invalid parameter") if (defined $_[0] && exists $param{'links'} && ref $param{'links'} ne 'ARRAY');
$self->links(@{$param{'links'}}) if exists $param{'links'};
my $html;
$self->{title} = 'perlinfo(INFO_MODULES)' unless $self->{title};
$html .= $self->print_htmlhead() if $self->{'full_page'};
$html .= print_thesemodules('all');
$html .= "</div></body></html>" if $self->{'full_page'};
defined wantarray ? return $html : print $html;
}
sub info_config {
my $self = shift;
my %param = @_;
error_msg("invalid parameter") if (defined $_[0] && exists $param{'links'} && ref $param{'links'} ne 'ARRAY');
$self->links(@{$param{'links'}}) if exists $param{'links'};
my $html;
$self->{title} = 'perlinfo(INFO_CONFIG)' unless $self->{title};
$html .= $self->print_htmlhead() if $self->{full_page};
$html .= print_config('info_config');
$html .= "</div></body></html>" if $self->{full_page};
defined wantarray ? return $html : print $html;
}
sub info_apache {
my $self = shift;
my %param = @_;
error_msg("invalid parameter") if (defined $_[0] && exists $param{'links'} && ref $param{'links'} ne 'ARRAY');
$self->links(@{$param{'links'}}) if exists $param{'links'};
my $html;
$self->{title} = 'perlinfo(INFO_APACHE)' unless $self->{title};
$html .= $self->print_htmlhead() if $self->{full_page};
$html .= print_httpd();
$html .= "</div></body></html>" if $self->{full_page};
defined wantarray ? return $html : print $html;
}
sub info_variables {
my $self = shift;
my %param = @_;
error_msg("invalid parameter") if (defined $_[0] && exists $param{'links'} && ref $param{'links'} ne 'ARRAY');
$self->links(@{$param{'links'}}) if exists $param{'links'};
my $html;
$self->{title} = 'perlinfo(INFO_VARIABLES)' unless $self->{title};
$html .= $self->print_htmlhead() if $self->{full_page};
$html .= print_variables();
$html .= "</div></body></html>" if $self->{full_page};
defined wantarray ? return $html : print $html;
}
sub info_license {
my $self = shift;
my %param = @_;
error_msg("invalid parameter") if (defined $_[0] && exists $param{'links'} && ref $param{'links'} ne 'ARRAY');
$self->links(@{$param{'links'}}) if exists $param{'links'};
my $html;
$self->{title} = 'perlinfo(INFO_LICENSE)' unless $self->{title};
$html .= $self->print_htmlhead() if $self->{full_page};
$html .= print_license();
$html .= "</div></body></html>" if $self->{full_page};
defined wantarray ? return $html : print $html;
}
sub print_htmlhead {
my $self = shift;
my $title = $self->{title};
my $bg_image = $self->{bg_image};
my $bg_position = $self->{bg_position};
my $bg_repeat = $self->{bg_repeat};
my $bg_attribute = $self->{bg_attribute};
my $bg_color = $self->{bg_color};
my $ft_family = $self->{ft_family};
my $ft_color = $self->{ft_color};
my $lk_color = $self->{lk_color};
my $lk_decoration = $self->{lk_decoration};
my $lk_bgcolor = $self->{lk_bgcolor};
my $lk_hvdecoration = $self->{lk_hvdecoration};
my $header_bgcolor = $self->{header_bgcolor};
my $header_ftcolor = $self->{header_ftcolor};
my $leftcol_bgcolor =$self->{leftcol_bgcolor};
my $leftcol_ftcolor = $self->{leftcol_ftcolor};
my $rightcol_bgcolor = $self->{rightcol_bgcolor};
my $rightcol_ftcolor = $self->{rightcol_ftcolor};
my $html = <<"END_OF_HTML";
<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<style type="text/css"><!--
body {
background-color: $bg_color;
background-image: url($bg_image);
background-position: $bg_position;
background-repeat: $bg_repeat;
background-attachment: $bg_attribute;
color: $ft_color;}
body, td, th, h1, h2 {font-family: $ft_family;}
pre {margin: 0px; font-family: monospace;}
a:link {color: $lk_color; text-decoration: $lk_decoration; background-color: $lk_bgcolor;}
a:hover {text-decoration: $lk_hvdecoration;}
table {border-collapse: collapse;}
.center {text-align: center;}
.center table { margin-left: auto; margin-right: auto; text-align: left;}
.center th { text-align: center !important; }
td, th { border: 1px solid #000000; font-size: 75%; vertical-align: baseline;}
.modules table {border: 0;}
.modules td { border:0; font-size: 100%; vertical-align: baseline;}
.modules th { border:0; font-size: 100%; vertical-align: baseline;}
h1 {font-size: 150%;}
h2 {font-size: 125%;}
.p {text-align: left;}
.e {background-color: $leftcol_bgcolor; font-weight: bold; color: $leftcol_ftcolor;}
.h {background-color: $header_bgcolor; font-weight: bold; color: $header_ftcolor;}
.v {background-color: $rightcol_bgcolor; color: $rightcol_ftcolor;}
i {color: #666666; background-color: #cccccc;}
img {float: right; border: 0px;}
hr {width: 600px; background-color: #cccccc; border: 0px; height: 1px; color: #000000;}
//--></style>
<title>$title</title>
</head>
<body><div class="center">
END_OF_HTML
defined wantarray ? return $html : print $html;
}
sub links {
my $self = shift;
my $args = process_args(@_, \&check_args);
if (exists $args->{'0'}) {
$HTML::Perlinfo::Common::links{'all'} = 0;
}
elsif (exists $args->{'1'}) {
$HTML::Perlinfo::Common::links{'all'} = 1;
}
elsif (exists $args->{'docs'} && not exists $args->{'local'}) {
$HTML::Perlinfo::Common::links{'docs'} = $args->{'docs'};
}
elsif (exists $args->{'local'} && not exists $args->{'docs'}) {
$HTML::Perlinfo::Common::links{'local'} = $args->{'local'};
}
elsif (exists $args->{'docs'} && exists $args->{'local'}) {
$HTML::Perlinfo::Common::links{'docs'} = $args->{'docs'},
$HTML::Perlinfo::Common::links{'local'} = $args->{'local'},
}
}
1;

View File

@@ -0,0 +1,515 @@
package HTML::Perlinfo::Common;
use CGI qw(escapeHTML);
our @ISA = qw(Exporter);
our @EXPORT = qw(initialize_globals print_table_colspan_header print_table_row print_table_color_start print_table_color_end print_color_box print_table_row_color print_table_start print_table_end print_box_start print_box_end print_hr print_table_header print_section print_license add_link check_path check_args check_module_args perl_version release_date process_args error_msg match_string);
require Exporter;
use Carp ();
our %links;
%links = (
'all' => 1,
'local' => 0,
'docs' => 1,
);
##### The following is lifted from File::Which 0.05 by Per Einar Ellefsen.
##### The check_path sub uses the which sub.
#############
use File::Spec;
my $Is_VMS = ($^O eq 'VMS');
my $Is_MacOS = ($^O eq 'MacOS');
my $Is_DOSish = (($^O eq 'MSWin32') or
($^O eq 'dos') or
($^O eq 'os2'));
# For Win32 systems, stores the extensions used for
# executable files
# For others, the empty string is used
# because 'perl' . '' eq 'perl' => easier
my @path_ext = ('');
if ($Is_DOSish) {
if ($ENV{PATHEXT} and $Is_DOSish) { # WinNT. PATHEXT might be set on Cygwin, but not used.
push @path_ext, split ';', $ENV{PATHEXT};
}
else {
push @path_ext, qw(.com .exe .bat); # Win9X or other: doesn't have PATHEXT, so needs hardcoded.
}
}
elsif ($Is_VMS) {
push @path_ext, qw(.exe .com);
}
sub which {
my ($exec) = @_;
return undef unless $exec;
my $all = wantarray;
my @results = ();
# check for aliases first
if ($Is_VMS) {
my $symbol = `SHOW SYMBOL $exec`;
chomp($symbol);
if (!$?) {
return $symbol unless $all;
push @results, $symbol;
}
}
if ($Is_MacOS) {
my @aliases = split /\,/, $ENV{Aliases};
foreach my $alias (@aliases) {
# This has not been tested!!
# PPT which says MPW-Perl cannot resolve `Alias $alias`,
# let's just hope it's fixed
if (lc($alias) eq lc($exec)) {
chomp(my $file = `Alias $alias`);
last unless $file; # if it failed, just go on the normal way
return $file unless $all;
push @results, $file;
# we can stop this loop as if it finds more aliases matching,
# it'll just be the same result anyway
last;
}
}
}
my @path = File::Spec->path();
unshift @path, File::Spec->curdir if $Is_DOSish or $Is_VMS or $Is_MacOS;
for my $base (map { File::Spec->catfile($_, $exec) } @path) {
for my $ext (@path_ext) {
my $file = $base.$ext;
if ((-x $file or # executable, normal case
($Is_MacOS || # MacOS doesn't mark as executable so we check -e
($Is_DOSish and grep { $file =~ /$_$/i } @path_ext[1..$#path_ext])
# DOSish systems don't pass -x on non-exe/bat/com files.
# so we check -e. However, we don't want to pass -e on files
# that aren't in PATHEXT, like README.
and -e _)
) and !-d _)
{ # and finally, we don't want dirs to pass (as they are -x)
return $file unless $all;
push @results, $file; # Make list to return later
}
}
}
if($all) {
return @results;
} else {
return undef;
}
}
## End File::Which code
sub check_path {
return add_link('local', which("$_[0]")) if which("$_[0]");
return "<i>not in path</i>";
}
sub match_string {
my($module_name, $string) = @_;
my $result = 0;
my @string = (ref $string eq 'ARRAY') ? @$string : ($string);
foreach(@string) {
$result = index(lc($module_name), lc($_));
last if ($result != -1);
}
return ($result == -1) ? 0 : 1;
}
sub perl_version {
my $version;
if ($] >= 5.006) {
$version = sprintf "%vd", $^V;
}
else { # else time to update Perl!
$version = "$]";
}
return $version;
}
sub release_date {
# when things escaped
%released = (
5.000 => '1994-10-17',
5.001 => '1995-03-14',
5.002 => '1996-02-96',
5.00307 => '1996-10-10',
5.004 => '1997-05-15',
5.005 => '1998-07-22',
5.00503 => '1999-03-28',
5.00405 => '1999-04-29',
5.006 => '2000-03-22',
5.006001 => '2001-04-08',
5.007003 => '2002-03-05',
5.008 => '2002-07-19',
5.008001 => '2003-09-25',
5.009 => '2003-10-27',
5.008002 => '2003-11-05',
5.006002 => '2003-11-15',
5.008003 => '2004-01-14',
5.00504 => '2004-02-23',
5.009001 => '2004-03-16',
5.008004 => '2004-04-21',
5.008005 => '2004-07-19',
5.008006 => '2004-11-27',
5.009002 => '2005-04-01',
5.008007 => '2005-05-30',
5.009003 => '2006-01-28',
5.008008 => '2006-01-31',
5.009004 => '2006-08-15',
5.009005 => '2007-07-07',
5.010000 => '2007-12-18',
);
# Do we have Module::Corelist
eval{require Module::CoreList};
if ($@) { # no
return ($released{$]}) ? $released{$]} : "unknown";
}
else { # yes
return ($Module::CoreList::released{$]}) ? $Module::CoreList::released{$]} : "unknown";
}
}
sub check_args {
my ($key, $value) = @_;
my ($message, %allowed);
@allowed{qw(docs local 0 1)} = ();
if (not exists $allowed{$key}) {
$message = "$key is an invalid links parameter";
}
elsif ($key =~ /(?:docs|local)/ && $value !~ /^(?:0|1)$/i) {
$message = "$value is an invalid value for the $key parameter in the links attribute";
}
error_msg("$message") if $message;
}
sub check_module_args {
my ($key, $value) = @_;
my ($message, %allowed);
@allowed{qw(from columns sort_by color link show_only section full_page show_inc show_dir files_in)} = ();
if (not exists $allowed{$key}) {
$message = "$key is an invalid print_modules parameter";
}
elsif ($key eq 'sort_by' && $value !~ /^(?:name|version)$/i) {
$message = "$value is an invalid sort";
}
elsif ($key =~ /^(?:color|link|columns|files_in)$/ && ref($value) ne 'ARRAY') {
$message = "The $key parameter value is not an array reference";
}
elsif ($key eq 'columns' && grep(!/^(?:name|version|desc|path|core)$/, @{$value})) {
$message = "Invalid column name in the $key parameter";
}
elsif ($key eq 'color' && @{$value} <= 1) {
$message = "You didn't specify a module to color";
}
elsif ($key eq 'link' && @{$value} <= 1 && $value->[0] != 0) {
$message = "You didn't provide a URL for the $key parameter";
}
elsif ($key eq 'show_only' && (ref($value) ne 'ARRAY') && lc $value ne 'core') {
$message = "$value is an invalid value for the $key parameter";
}
elsif ($key eq 'full_page' && $value != 0 && $value != 1 ) {
$message = "$value is an invalid value for the $key parameter";
}
elsif ($key eq 'link' && ($value->[0] ne 'all' && $value->[0] != 0 && ref($value->[0]) ne 'ARRAY')) {
$message = "Invalid first element in the $key parameter value";
}
error_msg("$message") if $message;
}
sub process_args {
# This sub returns a hash ref containing param args
my %params;
my $sub = pop @_ || die "No coderef provided\n"; # get the sub
if (defined $_[0]) {
while(my($key, $value) = splice @_, 0, 2) {
$sub->($key, $value);
if (exists $params{$key}){
@key_value = ref(${$params{$key}}[0]) eq 'ARRAY' ? @{$params{$key}} : $params{$key};
push @key_value,$value;
$new_val = [@key_value];
$params{$key} = $new_val;
}
else {
$params{$key} = $value;
}
}
}
return \%params;
}
sub error_msg {
local $Carp::CarpLevel = $Carp::CarpLevel + 1;
Carp::croak "User error: $_[0]";
}
# HTML subs
sub print_table_colspan_header {
return sprintf("<tr class=\"h\"><th colspan=\"%d\">%s</th></tr>\n", $_[0], $_[1]);
}
sub print_table_row {
my $num_cols = $_[0];
my $HTML = "<tr>";
for ($i=0; $i<$num_cols; $i++) {
$HTML .= sprintf("<td class=\"%s\">", ($i==0 ? "e" : "v" ));
my $row_element = $_[$i+1];
if ((not defined ($row_element)) || ($row_element !~ /\S/)) {
$HTML .= "<i>no value</i>";
} else {
my $elem_esc;
if ($row_element eq "<i>no value</i>") {
$elem_esc = $row_element;
} else {
$elem_esc = escapeHTML($row_element);
}
$HTML .= "$elem_esc";
}
$HTML .= " </td>";
}
$HTML .= "</tr>\n";
return $HTML;
}
sub print_table_color_start {
return qq~<table class="modules" cellpadding=4 cellspacing=4 border=0 width="600"><tr>\n~;
}
sub print_table_color_end {
return '</tr></table>';
}
sub print_color_box {
return qq ~<td>
<table border=0>
<tr><td>
<table class="modules" border=0 width=50 height=50 align=left bgcolor="$_[0]">
<tr bgcolor="$_[0]">
<td color="$_[0]">
&nbsp;
</td>
</tr>
</table>
</tr></td>
<tr><td><small>$_[1]</small></td></tr>
</table>
</td>~;
}
sub print_table_row_color {
my $num_cols = $_[0];
my $HTML = $_[1] ? "<tr bgcolor=\"$_[1]\">" : "<tr>";
for ($i=0; $i<$num_cols; $i++) {
$HTML .= $_[1] ? "<td bgcolor=\"$_[1]\">" : sprintf("<td class=\"%s\">", ($i==0 ? "e" : "v" ));
my $row_element = $_[$i+2]; # start at the 2nd element
if ((not defined ($row_element)) || ($row_element !~ /\S/)) {
$HTML .= "<i>no value</i>";
} else {
my $elem_esc = $row_element;
$HTML .= "$elem_esc";
}
$HTML .= " </td>";
}
$HTML .= "</tr>\n";
return $HTML;
}
sub print_table_start {
return "<table border=\"0\" cellpadding=\"3\" width=\"600\">\n";
}
sub print_table_end {
return "</table><br />\n";
}
sub print_box_start {
my $HTML = print_table_start();
$HTML .= ($_[0] == 1) ? "<tr class=\"h\"><td>\n" : "<tr class=\"v\"><td>\n";
return $HTML;
}
sub print_box_end {
my $HTML = "</td></tr>\n";
$HTML .= print_table_end();
return $HTML;
}
sub print_hr {
return "<hr />\n";
}
sub print_table_header {
my($num_cols) = $_[0];
my $HTML = "<tr class=\"h\">";
my $i;
for ($i=0; $i<$num_cols; $i++) {
my $row_element = $_[$i+1];
$row_element = " " if (!$row_element);
$HTML .= "<th>$row_element</th>";
}
return "$HTML</tr>\n";
}
sub print_section {
return "<h2>" . $_[0] . "</h2>\n";
}
sub print_perl_license {
return <<"END_OF_HTML";
<p>
This program is free software; you can redistribute it and/or modify it under the terms of
either the Artistic License or the GNU General Public License, which may be found in the Perl 5 source kit.
</p>
<p>
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
</p>
<p>
Complete documentation for Perl, including FAQ lists, should be found on
this system using `man perl' or `perldoc perl'. If you have access to the
Internet, point your browser at @{[ add_link('same', 'http://www.perl.org/')]}, the Perl directory.
</p>
END_OF_HTML
}
sub print_license {
return join '', print_section("Perl License"),
print_box_start(0),
print_perl_license(),
print_box_end();
}
sub add_link {
my ($type, $value, $link) = @_;
return $value unless $links{'all'};
if ($type eq "cpan") {
return $value if $link && $link->[0] =~ /^[0]$/;
if ($link) {
if (ref $link->[0] eq 'ARRAY' && ref $link->[1] ne 'ARRAY') {
foreach (@{$link->[0]}) {
if ($_ eq 'all' or match_string($value,$_)==1) {
return '<a href=' . $link->[1] . $value .
qq~ title="Click here to see $value documentation [Opens in a new window]"
target="_blank">$value</a> ~
}
}
}
elsif (ref $link->[0] eq 'ARRAY' && ref $link->[1] eq 'ARRAY'){
foreach my $lv (@$link) {
if (ref $lv->[0] eq 'ARRAY') {
foreach(@{$lv->[0]}) {
if ($_ eq 'all' or match_string($value,$_)==1) {
return '<a href=' . $lv->[1] . $value .
qq~ title="Click here to see $value documentation [Opens in a new window]"
target="_blank">$value</a> ~
}
}
}
else {
if ($lv->[0] eq 'all' or match_string($value,$lv->[0])==1) {
return '<a href=' . $lv->[1] . $value .
qq~ title="Click here to see $value documentation [Opens in a new window]"
target="_blank">$value</a> ~
}
}
}
}
elsif ($link->[0] eq 'all' or match_string($value,$link->[0])==1) {
return '<a href=' . $link->[1] . $value .
qq~ title="Click here to see $value documentation [Opens in a new window]"
target="_blank">$value</a> ~
}
}
return qq~ <a href="http://search.cpan.org/perldoc?$value"
title="Click here to see $value on CPAN [Opens in a new window]" target="_blank">$value</a> ~;
}
elsif ($type eq "config") {
return $value unless $links{'docs'};
my ($letter) = $value =~ /^(.)/;
return qq! <a href="http://search.cpan.org/~aburlison/Solaris-PerlGcc-1.3/config/5.006001/5.10/sparc/Config.pm#$letter">$value</a> !;
}
elsif ($type eq "local") {
return $value unless $links{'local'};
return qq~ <a href="file://$value" title="Click here to see $value [Opens in a new window]" target="_blank">$value</a> ~;
}
elsif ($type eq "same") {
return qq~ <a href="$value">$value</a> ~;
}
}
1;

View File

@@ -0,0 +1,227 @@
package HTML::Perlinfo::General;
use base qw(Exporter);
our @EXPORT = qw(print_httpd print_thesemodules print_config print_variables print_general);
use CGI qw(url_param param);
use POSIX qw(uname);
use Config qw(%Config config_sh);
use Net::Domain qw(hostname);
use File::Spec;
use HTML::Perlinfo::Common;
use HTML::Perlinfo::Apache;
# Should search for Get, Post, Cookies, Session, and Environment.
sub perl_print_gpcse_array {
my $html;
my($name) = @_;
my ($gpcse_name, $gpcse_value);
#POST names should be param() and get in url_param()
if (defined($ENV{'QUERY_STRING'}) && length($ENV{'QUERY_STRING'}) >= 1) {
foreach(url_param()) { $html .= print_table_row(2, "GET[\"$_\"]", url_param($_)); }
}
if (defined($ENV{'CONTENT_LENGTH'})) {
foreach(param()) { $html .= print_table_row(2, "POST[\"$_\"]", param($_)); }
}
if (defined($ENV{'HTTP_COOKIE'})) {
$cookies = $ENV{'HTTP_COOKIE'};
@cookies = split(';',$cookies);
foreach (@cookies) {
($k,$v) = split('=',$_);
$html .= print_table_row(2, "COOKIE[\"$k\"]", "$v");
}
}
foreach my $key (sort(keys %ENV))
{
$gpcse_name = "$name" . '["' . "$key" . '"]';
if ($ENV{$key}) {
$gpcse_value = "$ENV{$key}";
} else {
$gpcse_value = "<i>no value</i>";
}
$html .= print_table_row(2, "$gpcse_name", "$gpcse_value");
}
return $html;
}
sub print_variables {
return join '', print_section("Environment"),
print_table_start(),
print_table_header(2, "Variable", "Value"),
((defined($ENV{'SERVER_SOFTWARE'})) ? perl_print_gpcse_array("SERVER") : perl_print_gpcse_array("ENV",)),
print_table_end();
}
sub print_config {
return join '', print_section("Perl Core"),
print_table_start(),
print_table_header(2, "Variable", "Value"),
print_config_sh($_[0]),
print_table_end();
}
sub print_config_sh {
my @configs = qw/api_versionstring cc ccflags cf_by cf_email cf_time extensions installarchlib installbin installhtmldir installhtmlhelpdir installprefix installprefixexp installprivlib installscript installsitearch installsitebin known_extensions libc libperl libpth libs myarchname optimize osname osvers package perllibs perlpath pm_apiversion prefix prefixexp privlib privlibexp startperl version version_patchlevel_string xs_apiversion/;
return ($_[0] eq 'info_all') ?
join '', map { print_table_row(2, add_link('config',$_), $Config{$_})} @configs
: join '', map { print_table_row(2, map{
if ($_ !~ /^'/) {
add_link('config', $_);
}
else {
if ($_ eq "\'\'" || $_ eq "\' \'" ) {
my $value = "<i>no value</i>";
$value;
}
else {
$_;
}
}
}split/=/,$_ )
} split /\n/, config_sh();
}
sub print_httpd {
return unless (defined $ENV{'SERVER_SOFTWARE'} && $ENV{'SERVER_SOFTWARE'} =~ /apache/i);
my $a = HTML::Perlinfo::Apache->new();
my $html .= print_section("Apache");
$html .= print_table_start();
$html .= $a->print_apache() if $a->has qw(env);
$html .= print_table_end();
$html .= $a->print_modperl() if $a->has qw(mp);
$html .= print_section("Apache Environment"),
$html .= print_table_start();
$html .= print_table_header(2, "Variable", "Value"),
$html .= $a->print_apache_environment() if $a->has qw(env);
$html .= print_table_end();
return $html;
}
sub print_thesemodules {
my $opt = shift;
$m = HTML::Perlinfo::Modules->new();
if ($opt eq 'core') {
return $m->print_modules(show_only=>$opt, full_page=>0);
}
elsif ($opt eq 'all') {
return $m->print_modules(section=>'All Perl modules', full_page=>0);
}
elsif ($opt eq 'loaded' && ref $_[0] eq 'ARRAY') {
return $m->print_modules(section=>'Loaded Modules', files_in=>shift, full_page=>0);
}
else {
die "internal function print_thesemodules has invalid arguments: @_";
}
}
sub print_general {
my $opt = shift || 'full';
my $html = print_box_start(1);
$html .= sprintf("<h1 class=\"p\">Perl Version %s</h1><br style=\"clear:both\" />Release date: %s", perl_version(), release_date());
$html .= print_box_end();
$html .= print_table_start();
$html .= print_table_row(2, "Currently running on", "@{[ (uname)[0..4] ]}");
$html .= print_table_row(2, "Built for", "$Config{archname}");
$html .= print_table_row(2, "Build date", "$Config{cf_time}");
$html .= print_table_row(2, "Perl path", "$^X");
$html .= print_table_row(2, "Additional C Compiler Flags", "$Config{ccflags}");
$html .= print_table_row(2, "Optimizer/Debugger Flags", "$Config{optimize}");
if (defined($ENV{'SERVER_SOFTWARE'})) {
$html .= print_table_row(2, "Server API", "$ENV{'SERVER_SOFTWARE'}");
}
if ($Config{usethreads} && !$Config{useithreads} && !$Config{use5005threads}) {
$html .= print_table_row(2, "Thread Support", "enabled (threads)");
}
elsif ($Config{useithreads} && !$Config{usethreads} && !$Config{use5005threads}) {
$html .= print_table_row(2, "Thread Support", "enabled (ithreads)");
}
elsif ($Config{use5005threads} && !$Config{usethreads} && !$Config{useithreads}) {
$html .= print_table_row(2, "Thread Support", "enabled (5005threads)");
}
elsif ($Config{usethreads} && $Config{useithreads} && !$Config{use5005threads}) {
$html .= print_table_row(2, "Thread Support", "enabled (threads, ithreads)");
}
elsif ($Config{usethreads} && $Config{use5005threads} && !$Config{useithreads}) {
$html .= print_table_row(2, "Thread Support", "enabled (threads, 5005threads)");
}
elsif ($Config{useithreads} && $Config{use5005threads} && !$Config{usethreads}) {
$html .= print_table_row(2, "Thread Support", "enabled (ithreads, 5005threads)");
}
elsif ($Config{usethreads} && $Config{useithreads} && $Config{use5005threads}) {
$html .= print_table_row(2, "Thread Support", "enabled (threads, ithreads, 5005threads)");
}
else {
$html .= print_table_row(2, "Thread Support", "disabled (threads, ithreads, 5005threads)");
}
$html .= print_table_end();
$html .= print_box_start(0);
$html .= "This is perl, v$Config{version} built for $Config{archname}<br />Copyright (c) 1987-@{[ sprintf '%d', (localtime)[5]+1900]}, Larry Wall";
$html .= print_box_end();
return $html if $opt eq 'top';
$html .= print_hr();
$html .= "<h1>Configuration</h1>\n";
$html .= print_config('info_all');
$html .= join '', print_section("Perl utilities"),
print_table_start(),
print_table_header(2, "Name", "Location"),
print_table_row(2, add_link('cpan', 'h2ph'), check_path("h2ph")),
print_table_row(2, add_link('cpan', 'h2xs'), check_path("h2xs")),
print_table_row(2, add_link('cpan', 'perldoc'), check_path("perldoc")),
print_table_row(2, add_link('cpan', 'pod2html'), check_path("pod2html")),
print_table_row(2, add_link('cpan', 'pod2latex'), check_path("pod2latex")),
print_table_row(2, add_link('cpan', 'pod2man'), check_path("pod2man")),
print_table_row(2, add_link('cpan', 'pod2text'), check_path("pod2text")),
print_table_row(2, add_link('cpan', 'pod2usage'), check_path("pod2usage")),
print_table_row(2, add_link('cpan', 'podchecker'), check_path("podchecker")),
print_table_row(2, add_link('cpan', 'podselect'), check_path("podselect")),
print_table_end(),
print_section("Mail"),
print_table_start(),
print_table_row(2, 'SMTP', hostname()),
print_table_row(2, 'sendmail_path', check_path("sendmail")),
print_table_end(),
print_httpd(),
print_section("HTTP Headers Information"),
print_table_start(),
print_table_colspan_header(2, "HTTP Request Headers");
if (defined($ENV{'SERVER_SOFTWARE'})) {
$html .= join '',
print_table_row(2, 'HTTP Request', "$ENV{'REQUEST_METHOD'} @{[File::Spec->abs2rel($0)]} $ENV{'SERVER_PROTOCOL'}"),
print_table_row(2, 'Host', $ENV{'HTTP_HOST'}),
print_table_row(2, 'User-Agent', $ENV{'HTTP_USER_AGENT'}),
print_table_row(2, 'Accept', $ENV{'HTTP_ACCEPT_ENCODING'}),
print_table_row(2, 'Accept-Language', $ENV{'HTTP_ACCEPT_LANGUAGE'}),
print_table_row(2, 'Accept-Charset', $ENV{'HTTP_ACCEPT_CHARSET'}),
print_table_row(2, 'Keep-Alive', $ENV{'HTTP_KEEP_ALIVE'}),
print_table_row(2, 'Connection', $ENV{'HTTP_CONNECTION'});
}
$html .= print_table_end();
return $html;
}
1;

View File

@@ -0,0 +1,147 @@
=pod
=head1 NAME
HTML::Perlinfo::HTML - HTML documentation for the perlinfo library
=head1 SUMMARY
HTML::Perlinfo validates as XHTML 1.0 Transitional.
In the perlinfo library, L<HTML::Perlinfo> and L<HTML::Perlinfo::Modules> use the internal module HTML::Perlinfo::Common for HTML generation. This document provides information on that HTML and its manipulation.
=head1 CUSTOMIZING THE HTML
You can capture the HTML output by assigning it to a scalar. Then you can alter the HTML before printing it or doing something else with it. Here is an example that uses the perlinfo function from L<HTML::Perlinfo>:
use HTML::Perlinfo;
my $example = perlinfo(); # Now I can do whatever I want with $example
$example =~ s/Perl/Java/ig; # Make everyone laugh
print $example;
Another option is to use object attributes which make altering some HTML elements less helter skelter.
=head1 OBJECT ATTRIBUTES
These object attributes allow you to change the HTML CSS settings to achieve a stylish effect. Please see your favorite HTML guide for acceptable CSS values. Refer to the HTML source code of the perlinfo page for the defaults.
Attribute name/Corresponding CSS element
title / page title (only non-CSS element)
bg_image / background_image
bg_position / background_position
bg_repeat / background_repeat
bg_attribute / background_attribute
bg_color / background_color
ft_family / font_familty
ft_color / font_color
lk_color / link color
lk_decoration / link text-decoration
lk_bgcolor / link background-color
lk_hvdecoration / link hover text-decoration
header_bgcolor / table header background-color
header_ftcolor / table header font color
leftcol_bgcolor / background-color of leftmost table cell
leftcol_ftcolor / font color of left table cell
rightcol_bgcolor / background-color of right table cell
rightcol_ftcolor / font color of right table cell
=head2 CSS EXAMPLE
$p = HTML::Perlinfo->new(
bg_image => 'http://i104.photobucket.com/albums/m176/perlinfo/camel.gif',
bg_repeat => 'yes-repeat'
);
$p->info_all;
=head1 print_htmlhead
This method prints the head container tags containing the style sheet, along with a few other html tags. It is useful to call this method when full_page is set to 0 and you are piecing together multiple perlinfo pages into one page. For example:
$m = HTML::Perlinfo::Modules->new( full_page => 0 ); # Just the bare essentials please
$m->print_htmlhead; # Print the beginning of an html document
$m->print_modules( from =>'/home/paco',
section => 'The Modules in Paco's Home Directory'
);
$m->print_modules( from =>'/home/cowboy',
section => 'The Modules in Cowboy's Home Directory'
);
When full_page is set to 1 (the default value), print_htmlhead is called internally. Note that you can still set CSS values in the constructor even when full_page is set to 0 and see the results in print_htmlhead.
$m = HTML::Perlinfo::Modules->new(
full_page => 0,
bg_color => 'gray'
);
$m->print_htmlhead; # Prints a HTML document with a gray background
Of course, you don't have to use the print_htmlhead method. You could insert your own HTML with your own style sheet when you set full_page to 0.
=head1 full_page
Do you want only a fragment of HTML and not a page with body tags (among other things)? Then the full_page option is what you need to use (or a regular expression, as explained above). This option allows you to add your own header/footer if you so desire. By default, the value is 1. Set it to 0 to output the HTML report with as little HTML as possible.
$p = HTML::Perlinfo->new( full_page => 0 ); # Change value to 1 to get a full HTML page
=head1 links
By default, there will be useful links in most of the presented HTML in the perlinfo library. These links are for pages on search.cpan.org. Even the info_config method lists links to the config options in the core Config module.
To manipulate links in the perlinfo library, you can use the links attribute in the info methods. Not to be confused with the "link" attribute in the HTML::Perlinfo::Module (which allows you to provide your own links for modules), this attribute's primary purpose is to turn on linking or turn it off. Of course, you can achieve the same effect by using regular expressions, as explained above. But using the links attribute makes your code cleaner.
There are several arguments (in an array reference) you can supply to the links attribute.
The number 1 turns on all default links and 0 will remove them.
For example, to remove the default links in the info_all method, you would say:
$p->info_all( links=>[0] ); # contains no links. Good for printing!
The example above removes all default links and it even ignores the L<link|HTML::Perlinfo::Modules#link> parameter in the print_modules method of L<HTML::Perlinfo::Modules>.
The named parameters for the links attribute are 'docs' and 'local' which controls links associated with modules, programs in the Perl utilities section, and the Config section and everywhere else. The value for either parameter can be either '1' or '0'.
=over
=item docs
Using 'docs', you can control the display of the default links to module and program documentation on search.cpan.org. But the link parameter in L<HTML::Perlinfo::Modules> can override this directive. By overridding 'docs=>0', you can show documentation for certain modules and not show documentation for any others. This is useful, for example, when you have homegrown modules without any documentation but want to show links to documentation for CPAN modules on the same page. Observe:
$p->print_modules( links => [docs=>0], link => [qr/Apache::/, 'http://www.myexample.com/perldoc/'] );
In the above example, only links to Apache modules would appear. Other modules would not have links to any documentation. Note that had you simply set the value for links to zero, then the other attribute concerning Apache modules would have been irrelevant, since no links whatsoever would have appeared. In other words, you can mix and match these two atttibutes to achieve many different and wonderous effects. Have fun! Be imaginative!
For more information on print_modules and its link parameter, please see L<HTML::Perlinfo::Modules>.
=item local
With the 'local' parameter set to 1, the local location of a module or program will be a link. This is useful if you want to see the local installation directory of a module in your browser. (From there, you could also look at the contents of said files.)
Note that this link would only work if you use the perlinfo library from the command-line and then view the resulting page on the same machine. Hence these local links are not present by default.
You can even use 'docs' along with 'local'.
$p->info_all( links => [docs=>0,local=>1] )
=back
=head1 NOTES
L<HTML::Perlinfo::Modules> allows you to color code specific modules.
More HTML options should be available in future revisions. Want to see a new feature/change? Then contact me about it.
=head1 AUTHOR
Mike Accardo <mikeaccardo@yahoo.com>
=head1 COPYRIGHT
Copyright (c) 2009, Mike Accardo. All Rights Reserved.
=cut

View File

@@ -0,0 +1,66 @@
package HTML::Perlinfo::Loaded;
BEGIN { %Seen = %INC }
use HTML::Perlinfo::Modules;
$VERSION = '1.02';
%INC = %Seen;
END {
delete $INC{'HTML/Perlinfo/Loaded.pm'};
my $m = HTML::Perlinfo::Modules->new(full_page=>0, title=>'perlinfo(INFO_LOADED)');
$m->print_htmlhead;
$m->print_modules('files_in'=>[values %INC],'section'=>'Loaded Modules');
print $m->info_variables,"</div></body></html>";
}
1;
__END__
=pod
=head1 NAME
HTML::Perlinfo::Loaded - Post-execution HTML dump of loaded modules and environment variables
=head1 SYNOPSIS
#!/usr/bin/perl
use HTML::Perlinfo::Loaded;
...
=head1 DESCRIPTION
This module installs an at-exit handler to generate an HTML dump of all the module files used by a Perl program. As an added bonus, environment variables are also included in this dump. When used under mod_perl, the module will show you preloaded modules in the HTML page too.
Since the "dump" is a complete HTML page, this module is a good debugging tool for Web applications. Just make sure you print the content-type header beforehand or you will get an internal server error (malformed header).
Note that the HTML::Perlinfo function 'perlinfo' has an option called INFO_LOADED that will produce the same result. In other words, there is more than one way to do it! Observe:
use HTML::Perlinfo;
perlinfo(INFO_LOADED);
The result will be the same if you say:
#!/usr/bin/perl
use HTML::Perlinfo::Loaded;
...
There is no difference, except using the perlinfo option gives you greater control. You could always control HTML::Perlinfo::Loaded with a pound sign (a comment on/off), but if you are using mod_perl it makes more sense to add HTML::Perlinfo to your startup file and then call perlinfo(INFO_LOADED) when you want to dump.
=head1 SEE ALSO
L<Devel::Loaded>, L<HTML::Perlinfo::Modules>, L<HTML::Perlinfo>, L<perlinfo>
=head1 AUTHOR
Mike Accardo <mikeaccardo@yahoo.com>
=head1 COPYRIGHT
Copyright (c) 2008, Mike Accardo. All Rights Reserved.
This module is free software. It may be used, redistributed
and/or modified under the terms of the Perl Artistic License.
=cut

View File

@@ -0,0 +1,753 @@
package HTML::Perlinfo::Modules;
use strict;
use File::Find;
use File::Spec;
use Carp ();
use Config qw(%Config);
use base qw(HTML::Perlinfo::Base);
use CGI qw(escapeHTML);
use HTML::Perlinfo::Common;
our $VERSION = '1.17';
sub new {
my ($class, %params) = @_;
$params{'title'} = exists $params{'title'} ? $params{'title'} : 'Perl Modules';
$class->SUPER::new(%params);
}
sub module_color_check {
my ($module_name, $color_specs) = @_;
if (defined $color_specs && ref($color_specs->[0]) eq 'ARRAY') {
foreach (@{ $color_specs }) {
return $_->[0] if (match_string($module_name,$_->[1])==1);
}
}
else {
return $color_specs->[0] if (defined $color_specs && match_string($module_name,$color_specs->[1])==1);
}
return 0;
}
# get_modinfo
# This sub was created for the files_in option.
# Returns found_mod reference
######################################
sub get_files_in {
my ($file_path) = @_;
return 0 unless $file_path =~ m/\.pm$/;
my $mod_info = module_info($file_path, undef);
return $mod_info;
}
sub sort_modules {
my ($modules, $sort_by) = @_;
my @sorted_modules;
if ($sort_by eq 'name') {
foreach my $key (sort {lc $a cmp lc $b} keys %$modules) {
# Check for duplicate modules
if (ref($modules->{$key}) eq 'ARRAY') {
foreach (@{ $modules->{$key} }) {
push @sorted_modules, $_;
}
}
else {
push @sorted_modules, $modules->{$key};
}
}
}
elsif ($sort_by eq 'version') {
foreach my $key (keys %$modules) {
if (ref($modules->{$key}) eq 'ARRAY') {
@{ $modules->{$key} } = sort {$a->{'version'} cmp $b->{'version'}}@{ $modules->{$key} };
for (@{ $modules->{$key}}) {
push @sorted_modules, $_;
}
}
else {
push @sorted_modules, $modules->{$key};
}
}
@sorted_modules = sort {$a->{'version'} cmp $b->{'version'}}@sorted_modules;
}
return @sorted_modules;
}
sub html_setup {
my ($self, $columns, $color_specs, $section, $full_page) = @_;
my $html;
$html .= $self->print_htmlhead if $full_page;
my %show_columns = (
'name' => 'Module name',
'version' => 'Version',
'path' => 'Location',
'core' => 'Core',
'desc' => 'Description'
);
$html .= $section ? print_section($section) : '';
$html .= print_color_codes($color_specs) if $color_specs && $color_specs->[2];
$html .= print_table_start();
$html .= print_table_header(scalar @$columns, map{ $show_columns{$_} }@$columns);
return $html;
}
sub module_info {
my ($module_path, $show_only) = @_;
( $module_path ) = $module_path =~ /^(.*)$/;
my ($mod_name, $mod_version, $mod_desc);
no warnings 'all'; # silence warnings
open(MOD, $module_path) or return 0;
while (<MOD>) {
unless ($mod_name) {
if (/^ *package +(\S+);/) {
$mod_name = $1;
}
}
unless ($mod_version) {
if (/([\$*])(([\w\:\']*)\bVERSION)\b.*\=/) {
my $line = substr $_, index($_, $1);
my $eval = qq{
package HTML::Perlinfo::_version;
no strict;
local $1$2;
\$$2=undef; do {
$line
}; \$$2
};
( $eval ) = $eval =~ /^(.*)$/sm;
$mod_version = eval($eval);
# Again let us be nice here.
$mod_version = '<i>unknown</i>' if (not defined $mod_version) || ($@);
$mod_version =~ s/^\s+|\s+$//;
}
}
unless ($mod_desc) {
if (/=head\d\s+NAME/) {
local $/ = '';
local $_;
chomp($_ = <MOD>);
($mod_desc) = /^.*?-+\s*(.*?)$/ism;
}
}
last if $mod_name && $mod_version && $mod_desc;
}
close (MOD);
return 0 if (! $mod_name || $show_only && ref $show_only && (match_string($mod_name, $show_only) == 0));
$mod_version = '<i>unknown</i>' if !($mod_version) || ($mod_version !~ /^[\.\d+_]+$/);
$mod_desc = escapeHTML($mod_desc) if $mod_desc;
$mod_desc = "<i>No description found</i>" unless $mod_desc;
return { 'name' => $mod_name, 'version' => $mod_version, 'desc' => $mod_desc };
}
sub print_color_codes {
my $color_specs = shift;
my ($html, $label);
$html .= print_table_start();
$html .= print_table_header(1, "Module Color Codes");
$html .= print_table_color_start();
if (ref($color_specs->[0]) eq 'ARRAY') {
my $count = 0;
foreach (@{ $color_specs }) {
$html .= "<tr>" if $count++ % 5 == 0;
$label = $_->[2] || $_->[1];
$html .= print_color_box($_->[0], $label);
$html .= "</tr>" if (($count >= 5 && $count % 5 == 0)||($count >= @{$color_specs}));
}
}
else {
$label = $color_specs->[2] || $color_specs->[1];
$html .= print_color_box($color_specs->[0], $label);
}
$html .= print_table_color_end();
$html .= print_table_end();
return $html;
}
sub print_module_results {
my ($mod_dir, $mod_count, $from, $overall_total, $show_dir) = @_;
my ($html, $total_amount, $searched, @mod_dir, @bad_dir, %seen);
if ($show_dir) {
$html .= print_table_start();
$html .= print_table_header(2, "Directory", "Number of Modules");
for my $dir (keys %{$mod_count}) {
my $amount_found = $mod_count->{$dir};
push (@mod_dir, $dir) if $amount_found;
}
for my $dir1 (@mod_dir) {
for my $dir2 (@mod_dir) {
if ($dir1 ne $dir2 && $dir2 =~ /^$dir1/) {
push @bad_dir, $dir2;
}
}
}
for my $top_dir (@mod_dir) {
unless (grep{$_ eq $top_dir }@bad_dir) {
$html .= print_table_row(2, add_link('local', File::Spec->canonpath($top_dir)), $mod_count->{$top_dir});
}
}
$html .= print_table_end();
}
else {
# Print out directories not in @INC
@mod_dir = grep { -d $_ && -r $_ && !$seen{$_}++ } map {File::Spec->canonpath($_)}@INC;
my @module_paths = grep { not exists $seen{$_} }@$mod_dir;
if (@module_paths >= 1) {
$html .= print_table_start();
$html .= print_table_header(3, "Directory", "Searched", "Number of Modules");
for my $dir (map{ File::Spec->canonpath($_) }@module_paths) {
$searched = (grep { $_ eq $dir } @$mod_dir) ? "yes" : "no";
my $amount_found = ($searched eq 'yes') ? $mod_count->{$dir} : '<i>unknown</i>';
$html .= print_table_row(3, add_link('local', File::Spec->canonpath($dir)), $searched, $amount_found);
}
$html .= print_table_end();
}
$html .= print_table_start();
$html .= print_table_header(3, "Include path (INC) directories", "Searched", "Number of Modules");
for my $dir (@mod_dir) {
$searched = exists $mod_count->{$dir} ? 'yes' : 'no';
my $amount_found = ($searched eq 'yes') ? $mod_count->{$dir} : '<i>unknown</i>';
$html .= print_table_row(3, add_link('local', File::Spec->canonpath($dir)), $searched, $amount_found);
}
$html .= print_table_end();
}
$html .= print_table_start();
#my $view = ($from eq 'all') ? 'installed' :
# ($from eq 'core') ? 'core' : 'found';
$html .= print_table_row(2, "Total modules", $overall_total);
$html .= print_table_end();
return $html;
}
sub search_dir {
my ($from, $show_only, $core_dir1, $core_dir2) = @_;
my %seen = ();
my @user_dir = (ref($from) eq 'ARRAY') && $show_only ne 'core' ? @{$from} :
($show_only eq 'core') ? ($core_dir1, $core_dir2) : $from;
# Make sure only unique entries and readable directories in @mod_dir
my @mod_dir = grep { -d $_ && -r $_ && !$seen{$_}++ } map {File::Spec->canonpath($_)}@user_dir;
if (@mod_dir != @user_dir) {
# Looks like there might have been a problem with the directories given to us.
# Or maybe not. @user_dir could have duplicate values and that's ok.
# But let's still warn about any unreadable or non-directories given
my @debug;
%seen = ();
@user_dir = grep { !$seen{$_}++ } map {File::Spec->canonpath($_)}@user_dir;
if (@user_dir > @mod_dir) {
#%seen = map {$_ => undef} @mod_dir;
%seen = ();
@seen{@mod_dir} = ();
my @difference = grep { !$seen{$_}++ }@user_dir;
foreach my $element (@difference) {
if (! -d $element) {
if ( grep {$_ eq $element} map {File::Spec->canonpath($_)}@INC) {
warn "$element is in the Perl include path, but is not a directory";
}
else {
warn "$element is not a directory";
}
push @debug, $element;
}
elsif (! -r $element) {
if ( grep {$_ eq $element} map {File::Spec->canonpath($_)}@INC) {
warn "$element is in the Perl include path, but is not readable";
}
else {
warn "$element is not a readable directory";
}
push @debug, $element;
}
}
}
}
error_msg("Search directories are invalid") unless @mod_dir >= 1;
return @mod_dir;
}
sub get_input {
my $self = shift;
my $args = process_args(@_, \&check_module_args);
my %input = ();
$input{'files_in'} = $args->{'files_in'} || undef;
$input{'sort_by'} = $args->{'sort_by'} || 'name';
$input{'from'} = $args->{'from'} || \@INC;
$input{'show_only'} = $args->{'show_only'} || "";
$input{'color_specs'} = $args->{'color'};
$input{'link'} = $args->{'link'};
$input{'section'} = exists $args->{'section'} ? $args->{'section'} :
$input{'show_only'} eq 'core' ? 'Core Perl modules installed' : '';
$input{'full_page'} = exists $args->{'full_page'} ? $args->{'full_page'} : $self->{'full_page'};
$input{'show_inc'} = exists $args->{'show_inc'} ? $args->{'show_inc'} : 1;
$input{'show_dir'} = exists $args->{'show_dir'} ? $args->{'show_dir'} : 0;
$input{'columns'} = exists $args->{'columns'} ? $args->{'columns'} : ['name','version','desc'];
return %input;
}
sub print_modules {
my %input = get_input(@_);
my ($found_mod, $mod_count, $overall_total, @mod_dir, $core_dir1, $core_dir2);
# Check to see if a search is even needed
if (defined $input{'files_in'}) {
my @files = @{ $input{'files_in'} };
my %found_mod = ();
foreach my $file_path (@files) {
my $mod_info = get_files_in($file_path);
next unless (ref $mod_info eq 'HASH');
$found_mod{$mod_info->{'name'}} = $mod_info;
}
return undef unless (keys %found_mod > 0);
$found_mod = \%found_mod;
}
else {
# Get ready to search
$core_dir1 = File::Spec->canonpath($Config{installarchlib});
$core_dir2 = File::Spec->canonpath($Config{installprivlib});
@mod_dir = search_dir($input{'from'}, $input{'show_only'}, $core_dir1, $core_dir2);
($overall_total, $found_mod, $mod_count) = find_modules($input{'show_only'}, \@mod_dir);
return undef unless $overall_total;
}
my @sorted_modules = sort_modules($found_mod, $input{'sort_by'});
my $html .= html_setup( $_[0],
$input{'columns'},
$input{'color_specs'},
$input{'section'},
$input{'full_page'}
);
my $numberof_columns = scalar @{$input{'columns'}};
foreach my $module (@sorted_modules) {
$html .= print_table_row_color( $numberof_columns,
module_color_check($module->{'name'}, $input{'color_specs'}),
map{
if ($_ eq 'name') {
add_link('cpan', $module->{'name'}, $input{'link'});
}
elsif ($_ eq 'core') {
(grep File::Spec->rel2abs($module->{'path'}) =~ /\Q$_/, ($core_dir1, $core_dir2)) ? 'yes' : 'no';
}
elsif ($_ eq 'path') {
add_link('local', $module->{'path'});
}
else {
$module->{$_};
}
} @{$input{'columns'}} );
}
$html .= print_table_end();
unless (defined $input{'files_in'} && ref $input{'files_in'} eq 'ARRAY') {
$html .= print_module_results( \@mod_dir,
$mod_count,
$input{'from'},
$overall_total,
$input{'show_dir'}) if $input{'show_inc'};
}
$html .= "</div></body></html>" if $input{'full_page'};
defined wantarray ? return $html : print $html;
}
sub find_modules {
my ($show_only, $mod_dir) = @_;
my ($overall_total, $module, $base, $start_dir, $new_val, $mod_info);
# arrays
my (@modinfo_array, @mod_dir);
# hashes
my ( %path, %inc_path, %mod_count, %found_mod);
@mod_dir = @$mod_dir;
@path{@mod_dir} = ();
@inc_path{@INC} = ();
for $base (@mod_dir) {
find ({ wanted => sub {
for (@INC, @mod_dir) {
if (index($File::Find::name, $_) == 0) {
# lets record it unless we already have hit the dir
$mod_count{$_} = 0 unless exists $mod_count{$_};
}
}
# This prevents mod_dir dirs from being searched again when you have a dir within a dir
$File::Find::prune = 1, return if exists $path{$File::Find::name} && $File::Find::name ne $File::Find::topdir;
# make sure we are dealing with a module
return unless $File::Find::name =~ m/\.pm$/;
$mod_info = module_info($File::Find::name, $show_only);
return unless ref ($mod_info) eq 'HASH';
# update the counts.
for (@INC, grep{not exists $inc_path{$_}}@mod_dir) {
if (index($File::Find::name, $_) == 0) {
$mod_count{$_}++;
}
}
$overall_total++;
$mod_info->{'path'} = File::Spec->canonpath($File::Find::dir);
# Check for duplicate modules
if (exists $found_mod{$mod_info->{'name'}}) {
@modinfo_array = ref( $found_mod{$mod_info->{'name'}} ) eq 'ARRAY' ? @{$found_mod{$mod_info->{'name'}}} : $found_mod{$mod_info->{'name'}};
push @modinfo_array, $mod_info;
$new_val = [@modinfo_array];
$found_mod{$mod_info->{'name'}} = $new_val;
}
else {
$found_mod{$mod_info->{'name'}} = $mod_info;
}
},untaint => 1, untaint_pattern => qr|^([-+@\s\S\w./]+)$|}, $base);
} # end of for loop
return ($overall_total, \%found_mod, \%mod_count);
}
1;
__END__
=pod
=head1 NAME
HTML::Perlinfo::Modules - Display a lot of module information in HTML format
=head1 SYNOPSIS
use HTML::Perlinfo::Modules;
my $m = HTML::Perlinfo::Modules->new();
$m->print_modules;
=head1 DESCRIPTION
This module outputs information about your Perl modules in HTML. The information includes a module's name, version, description and location. The HTML presents the module information in B<two sections>, one section is a list of modules and the other is a summary of this list. Both the list and its summary are configurable.
Other information displayed:
- Duplicate modules. So if you have CGI.pm installed in different locations, these duplicate modules will be shown.
- Automatic links to module documentation on CPAN (you can also provide your own URLs).
- The number of modules under each directory.
You can chose to show 'core' modules or you can search for specific modules. You can also define search paths. HTML::Perlinfo::Modules searches the Perl include path (from @INC) by default. You can also highlight specific modules with different colors.
=head1 METHODS
=head2 print_modules
This is the key method in this module. It accepts optional named parameters that dictate the display of module information. The method returns undefined if no modules were found. This means that you can write code such as:
my $modules = $m->print_modules(from=>'/home/paco');
if ($modules) {
print $modules;
}
else {
print "No modules are in Paco's home directory!";
}
The code example above will show you the modules in Paco's home directory if any are found. If none are found, the code prints the message in the else block. There is a lot more you can do with the named parameters, but you do not have to use them. For example:
$m->print_modules;
# The above line is the equivalent of saying:
$m->print_modules(
from => \@INC,
columns => ['name','version','desc'],
sort_by => 'name',
show_inc => 1
);
# Alternatively, in this case, you could use a HTML::Perlinfo method to achieve the same result.
# Note that HTML::Perlinfo::Modules inherits all of the HTML::Perlinfo methods
$m->info_modules;
The optional named parameters for the print_modules method are listed below.
=head3 from
Show modules from specific directories.
This parameter accepts 2 things: a single directory or an array reference (containing directories).
The default value is the Perl include path. This is equivalent of supplying \@INC as a value. If you want to show all of the modules on your box, you can specify '/' as a value (or the disk drive on Windows).
=head3 files_in
If you don't need to search for your files and you already have the B<complete pathnames> to them, then you can use the 'files_in' option which accepts an array reference containing the files you wish to display. One obvious use for this option would be in displaying the contents of the INC hash, which holds the modules used by your Perl module or script:
$m->print_modules('files_in'=>[values %INC]);
This is the same technique used by the L<HTML::Perlinfo::Loaded> module which performs a post-execution HTML dump of your loaded modules. See L<HTML::Perlinfo::Loaded> for details.
=head3 columns
This parameter allows you to control the table columns in the list of modules. With this parameter, you can dictate which columns will be shown and their order. Examples:
# Show only module names
columns=>['name']
# Show version numbers before names
columns=>['version','name']
# Default columns are:
columns=>['name','version','desc']
The column parameter accepts an array reference containing strings that represent the column names. Those names are:
=over
=item name
The module name. This value is the namespace in the package declaration. Note that the method for retrieving the module name is not fool proof, since a module file can have multiple package declarations. HTML::Perlinfo::Modules grabs the namespace from the first package declaration that it finds.
=item version
The version number. Divines the value of $VERSION.
=item desc
The module description. The description is from the POD. Note that some modules don't have POD (or have POD without a description) and, in such cases, the message "No description found" will be shown.
=item path
The full path to the module file on disk. Printing out the path is especially useful when you want to learn the locations of duplicate modules.
B<Note that you can make this path a link.> This is useful if you want to see the local installation directory of a module in your browser. (From there, you could also look at the contents of the files.) Be aware that this link would only work if you use this module from the command-line and then view the resulting page on the same machine. Hence these local links are not present by default. To learn more about local links, please refer to the L<HTML documentation|HTML::Perlinfo::HTML>.
=item core
This column value (either 'yes' or 'no') will tell you if the module is core. In other words, it will tell you if the module was included in your Perl distribution. If the value is 'yes', then the module lives in either the installarchlib or the installprivlib directory listed in the config file.
=back
=head3 sort_by
You use this parameter to sort the modules. Values can be either 'version' for version number sorting (in descending order) or 'name' for alphabetical sorting (the default).
=head3 show_only
This parameter acts like a filter and only shows you the modules (more specifically, the package names) you request. So if, for example, you wanted to only show modules in the Net namspace, you would use the show_only parameter. It is probably the most useful option available for the print_modules method. With this option, you can use HTML::Perlinfo::Modules as a search engine tool for your local Perl modules. Observe:
$m->print_modules(
show_only => ['MYCOMPANY::'],
section => 'My Company's Custom Perl Modules',
show_dir => 1
);
The example above will print out every module in the 'MYCOMPANY' namespace in the Perl include path (@INC). The list will be entitled 'My Company's Custom Perl Modules' and because show_dir is set to 1, the list will only show the directories in which these modules were found along with how many are present in each directory.
You can add namespaces to the array reference:
$m->print_modules(
show_only => ['MYCOMPANY::', 'Apache::'],
section => 'My Company's Custom Perl Modules & Apache Modules',
show_dir => 1
);
In addition to an array reference, show_only also accepts the word 'core', a value that will show you all of the core Perl modules (in the installarchlib and installprivlib directories from the config file).
=head3 show_inc
Whenever you perform a module search, you will see a summary of your search that includes the directories searched and the number of modules found. Whether or not your search encompasses the Perl include path (@INC), you will still see these directories, along with any other directories that were actually searched. If you do not what to see this search summary, you must set show_inc to 0. The default value is 1.
=head3 show_dir
The default value is 0. Setting this parameter to 1 will only show you the directories in which your modules were found (along with a summary of how many were found, etc). If you do not want to show a search summary, then you must use the show_inc parameter.
=head3 color
This parameter allows you to highlight modules with different colors. Highlighting specific modules is a good way to draw attention to them.
The parameter value must be an array reference containing at least 2 elements. The first element is the color itself which can be either a hex code like #FFD700 or the name of the color. The second element specifies the module(s) to color. And the third, optional element, in the array reference acts as a label in the color code section. This final element can even be a link if you so desire.
Examples:
color => ['red', 'Apache::'],
color => ['#FFD700', 'CGI::']
Alternatively, you can also change the color of the rows, by setting CSS values in the constructor. For example:
$m = HTML::Perlinfo::Modules->new(
leftcol_bgcolor => 'red',
rightcol_bgcolor => 'red'
);
$m->print_modules(
show_only => 'CGI::',
show_inc => 0
);
# This next example does the same thing, but uses the color parameter in the print_modules method
$m = HTML::Perlinfo::Modules->new();
$m->print_modules(
show_only => ['CGI::'],
color => ['red', 'CGI::'],
show_inc => 0
);
The above example will yield the same HTML results. So which approach should you use? The CSS approach gives you greater control of the HTML presentation. The color parameter, on the other hand, only affects the row colors in the modules list. You cannot achieve that same effect using CSS. For example:
$m->print_modules( color => ['red', 'CGI::'], color => ['red', 'Apache::'] );
The above example will list B<all of the modules> in @INC with CGI modules colored red and Apache modules colored blue.
For further information on customizing the HTML, including setting CSS values, please refer to the L<HTML documentation|HTML::Perlinfo::HTML>.
=head3 section
The section parameter lets you put a heading above the module list. Example:
$m->print_modules(
show_only => ['Apache::'],
section => 'Apache/mod_perl modules',
show_dir => 1,
);
=head3 full_page
Do you want only a fragment of HTML and not a page with body tags (among other things)? Then the full_page option is what you need to use (or a regular expression, as explained in the L<HTML documentation|HTML::Perlinfo::HTML>). This option allows you to add your own header/footer if you so desire. By default, the value is 1. Set it to 0 to output the HTML report with as little HTML as possible.
$m = HTML::Perlinfo::Modules->new( full_page => 0 );
# You will still get an HTML page but without CSS settings or body tags
$m->print_modules;
$m->print_modules( full_page => 1 ); # Now you will get the complete, default HTML page.
Note that the full_page option can be set in either the constructor or the method call. The advantage of setting it in the constructor is that every subsequent method call will have this attribute. (There is no limit to how many times you can call print_modules in a program. If calling the method more than once makes no sense to you, then you need to look at the show_only and from options.) If you set the full_page in the print_modules method, you will override its value in the object.
=head3 link
By default, every module is linked to its documentation on search.cpan.org. However some modules, such as custom modules, would not be in CPAN and their link would not show any documentation. With the 'link' parameter you can override the CPAN link with you own URL.
The parameter value must be an array reference containing two elements. The first element can either be a string specifying the module(s) to link or an array reference containing strings or the word 'all' which will link all the modules in the list. The second element is the root URL. In the link, the module name will come after the URL. So in the example below, the link for the Apache::Status module would be 'http://www.myexample.com/perldoc/Apache::Status'.
link => ['Apache::', 'http://www.myexample.com/perldoc/']
# Another example
my $module = HTML::Perlinfo::Modules
->new
->print_modules( show_only => ['CGI::','File::','HTML::'],
link => ['HTML::', 'http://www.html-example.com/perldoc/'],
link => [['CGI::','File::'], 'http://www.examples.com/perldoc/'] );
Further information about linking is in the L<HTML documentation|HTML::Perlinfo::HTML>.
=head1 CUSTOMIZING THE HTML
HTML::Perlinfo::Modules uses the same HTML generation as its parent module, HTML::Perlinfo.
You can capture the HTML output and manipulate it or you can alter CSS elements with object attributes.
(Note that you can also highlight certain modules with the color parameter to print_modules.)
For further details and examples, please see the L<HTML documentation|HTML::Perlinfo::HTML> in the HTML::Perlinfo distribution.
=head1 BUGS
Please report any bugs or feature requests to C<bug-html-perlinfo@rt.cpan.org>, or through the web interface at
L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=HTML-Perlinfo>.
I will be notified, and then you'll automatically be notified of progress on your bug as I make changes.
=head1 NOTES
If you decide to use this module in a CGI script, make sure you print out the content-type header beforehand.
=head1 SEE ALSO
L<HTML::Perlinfo::Loaded>, L<HTML::Perlinfo>, L<perlinfo>, L<Module::Info>, L<Module::CoreList>.
=head1 AUTHOR
Mike Accardo <mikeaccardo@yahoo.com>
=head1 COPYRIGHT
Copyright (c) 2006-8, Mike Accardo. All Rights Reserved.
This module is free software. It may be used, redistributed
and/or modified under the terms of the Perl Artistic License.
=cut

View File

@@ -0,0 +1,209 @@
package HTML::PullParser;
require HTML::Parser;
@ISA=qw(HTML::Parser);
$VERSION = "3.57";
use strict;
use Carp ();
sub new
{
my($class, %cnf) = @_;
# Construct argspecs for the various events
my %argspec;
for (qw(start end text declaration comment process default)) {
my $tmp = delete $cnf{$_};
next unless defined $tmp;
$argspec{$_} = $tmp;
}
Carp::croak("Info not collected for any events")
unless %argspec;
my $file = delete $cnf{file};
my $doc = delete $cnf{doc};
Carp::croak("Can't parse from both 'doc' and 'file' at the same time")
if defined($file) && defined($doc);
Carp::croak("No 'doc' or 'file' given to parse from")
unless defined($file) || defined($doc);
# Create object
$cnf{api_version} = 3;
my $self = $class->SUPER::new(%cnf);
my $accum = $self->{pullparser_accum} = [];
while (my($event, $argspec) = each %argspec) {
$self->SUPER::handler($event => $accum, $argspec);
}
if (defined $doc) {
$self->{pullparser_str_ref} = ref($doc) ? $doc : \$doc;
$self->{pullparser_str_pos} = 0;
}
else {
if (!ref($file) && ref(\$file) ne "GLOB") {
require IO::File;
$file = IO::File->new($file, "r") || return;
}
$self->{pullparser_file} = $file;
}
$self;
}
sub handler
{
Carp::croak("Can't set handlers for HTML::PullParser");
}
sub get_token
{
my $self = shift;
while (!@{$self->{pullparser_accum}} && !$self->{pullparser_eof}) {
if (my $f = $self->{pullparser_file}) {
# must try to parse more from the file
my $buf;
if (read($f, $buf, 512)) {
$self->parse($buf);
} else {
$self->eof;
$self->{pullparser_eof}++;
delete $self->{pullparser_file};
}
}
elsif (my $sref = $self->{pullparser_str_ref}) {
# must try to parse more from the scalar
my $pos = $self->{pullparser_str_pos};
my $chunk = substr($$sref, $pos, 512);
$self->parse($chunk);
$pos += length($chunk);
if ($pos < length($$sref)) {
$self->{pullparser_str_pos} = $pos;
}
else {
$self->eof;
$self->{pullparser_eof}++;
delete $self->{pullparser_str_ref};
delete $self->{pullparser_str_pos};
}
}
else {
die;
}
}
shift @{$self->{pullparser_accum}};
}
sub unget_token
{
my $self = shift;
unshift @{$self->{pullparser_accum}}, @_;
$self;
}
1;
__END__
=head1 NAME
HTML::PullParser - Alternative HTML::Parser interface
=head1 SYNOPSIS
use HTML::PullParser;
$p = HTML::PullParser->new(file => "index.html",
start => 'event, tagname, @attr',
end => 'event, tagname',
ignore_elements => [qw(script style)],
) || die "Can't open: $!";
while (my $token = $p->get_token) {
#...do something with $token
}
=head1 DESCRIPTION
The HTML::PullParser is an alternative interface to the HTML::Parser class.
It basically turns the HTML::Parser inside out. You associate a file
(or any IO::Handle object or string) with the parser at construction time and
then repeatedly call $parser->get_token to obtain the tags and text
found in the parsed document.
The following methods are provided:
=over 4
=item $p = HTML::PullParser->new( file => $file, %options )
=item $p = HTML::PullParser->new( doc => \$doc, %options )
A C<HTML::PullParser> can be made to parse from either a file or a
literal document based on whether the C<file> or C<doc> option is
passed to the parser's constructor.
The C<file> passed in can either be a file name or a file handle
object. If a file name is passed, and it can't be opened for reading,
then the constructor will return an undefined value and $! will tell
you why it failed. Otherwise the argument is taken to be some object
that the C<HTML::PullParser> can read() from when it needs more data.
The stream will be read() until EOF, but not closed.
A C<doc> can be passed plain or as a reference
to a scalar. If a reference is passed then the value of this scalar
should not be changed before all tokens have been extracted.
Next the information to be returned for the different token types must
be set up. This is done by simply associating an argspec (as defined
in L<HTML::Parser>) with the events you have an interest in. For
instance, if you want C<start> tokens to be reported as the string
C<'S'> followed by the tagname and the attributes you might pass an
C<start>-option like this:
$p = HTML::PullParser->new(
doc => $document_to_parse,
start => '"S", tagname, @attr',
end => '"E", tagname',
);
At last other C<HTML::Parser> options, like C<ignore_tags>, and
C<unbroken_text>, can be passed in. Note that you should not use the
I<event>_h options to set up parser handlers. That would confuse the
inner logic of C<HTML::PullParser>.
=item $token = $p->get_token
This method will return the next I<token> found in the HTML document,
or C<undef> at the end of the document. The token is returned as an
array reference. The content of this array match the argspec set up
during C<HTML::PullParser> construction.
=item $p->unget_token( @tokens )
If you find out you have read too many tokens you can push them back,
so that they are returned again the next time $p->get_token is called.
=back
=head1 EXAMPLES
The 'eg/hform' script shows how we might parse the form section of
HTML::Documents using HTML::PullParser.
=head1 SEE ALSO
L<HTML::Parser>, L<HTML::TokeParser>
=head1 COPYRIGHT
Copyright 1998-2001 Gisle Aas.
This library is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.
=cut

View File

@@ -0,0 +1,453 @@
package HTML::SimpleParse;
use strict;
use vars qw($VERSION $FIX_CASE);
$VERSION = '0.12';
my $debug = 0;
sub new {
my $pack = shift;
my $self = bless {
'text' => shift(),
'tree' => [],
@_
}, $pack;
$self->parse() if defined $self->{'text'} and length $self->{'text'};
return $self;
}
sub text {
my $self = shift;
$self->{'text'} = shift if @_;
return $self->{'text'};
}
sub tree { @{$_[0]->{'tree'}} }
sub parse {
# Much of this is a dumbed-down version of HTML::Parser::parse.
my $self = shift;
my $text = \ $self->{'text'};
my $tree = $self->{'tree'};
# Parse html text in $$text. The strategy is to remove complete
# tokens from the beginning of $$text until we can't decide whether
# it is a token or not, or the $$text is empty.
@$tree = ();
while (1) {
my ($content, $type);
# First we try to pull off any plain text (anything before a "<" char)
if ($$text =~ /\G([^<]+)/gcs) {
$content = $1; $type = 'text';
# Then, SSI, comments, and markup declarations (usually <!DOCTYPE...>)
# ssi: <!--#stuff-->
# comment: <!--stuff-->
# markup: <!stuff>
} elsif ($$text =~ /\G<(!--(\#?).*?--)>/gcs) {
$type = ($2 ? 'ssi' : 'comment');
$content = $1;
} elsif ($$text =~ /\G<(!.*?)>/gcs) {
$type = 'markup';
$content = $1;
# Then, look for an end tag
} elsif ($$text =~ m|\G<(/[a-zA-Z][a-zA-Z0-9\.\-]*\s*)>|gcs) {
$content = $1; $type = 'endtag';
# Then, finally we look for a start tag
# We know the first char is <, make sure there's a >
} elsif ($$text =~ /\G<(.*?)>/gcs) {
$content = $1; $type = 'starttag';
} else {
# the string is exhausted, or there's no > in it.
push @$tree, {
'content' => substr($$text, pos $$text),
'type' => 'text',
} unless pos($$text) eq length($$text);
last;
}
push @$tree, {
'content' => $content,
'type' => $type,
'offset' => ($type eq 'text' ?
pos($$text) - length($content) :
pos($$text) - length($content) - 2),
};
}
$self;
}
$FIX_CASE = 1;
sub parse_args {
my $self = shift; # Not needed here
my $str = shift;
my $fix_case = ((ref $self and exists $self->{fix_case}) ? $self->{fix_case} : $FIX_CASE);
my @returns;
# Make sure we start searching at the beginning of the string
pos($str) = 0;
while (1) {
next if $str =~ m/\G\s+/gc; # Get rid of leading whitespace
if ( $str =~ m/\G
([\w.-]+)\s*=\s* # the key
(?:
"([^\"\\]* (?: \\.[^\"\\]* )* )"\s* # quoted string, with possible whitespace inside,
| # or
'([^\'\\]* (?: \\.[^\'\\]* )* )'\s* # quoted string, with possible whitespace inside,
| # or
([^\s>]*)\s* # anything else, without whitespace or >
)/gcx ) {
my ($key, $val) = ($1, $+);
$val =~ s/\\(.)/$1/gs;
push @returns, ($fix_case==1 ? uc($key) : $fix_case==-1 ? lc($key) : $key), $val;
} elsif ( $str =~ m,\G/?([\w.-]+)\s*,gc ) {
push @returns, ($fix_case==1 ? uc($1) : $fix_case==-1 ? lc($1) : $1 ), undef;
} else {
last;
}
}
return @returns;
}
sub execute {
my $self = shift;
my $ref = shift;
my $method = "output_$ref->{type}";
warn "calling $self->$method(...)" if $debug;
return $self->$method($ref->{content});
}
sub get_output {
my $self = shift;
my ($method, $out) = ('', '');
foreach ($self->tree) {
$out .= $self->execute($_);
}
return $out;
}
sub output {
my $self = shift;
my $method;
foreach ($self->tree) {
print $self->execute($_);
}
}
sub output_text { $_[1]; }
sub output_comment { "<$_[1]>"; }
sub output_endtag { "<$_[1]>"; }
sub output_starttag { "<$_[1]>"; }
sub output_markup { "<$_[1]>"; }
sub output_ssi { "<$_[1]>"; }
1;
__END__
=head1 NAME
HTML::SimpleParse - a bare-bones HTML parser
=head1 SYNOPSIS
use HTML::SimpleParse;
# Parse the text into a simple tree
my $p = new HTML::SimpleParse( $html_text );
$p->output; # Output the HTML verbatim
$p->text( $new_text ); # Give it some new HTML to chew on
$p->parse # Parse the new HTML
$p->output;
my %attrs = HTML::SimpleParse->parse_args('A="xx" B=3');
# %attrs is now ('A' => 'xx', 'B' => '3')
=head1 DESCRIPTION
This module is a simple HTML parser. It is similar in concept to HTML::Parser,
but it differs from HTML::TreeBuilder in a couple of important ways.
First, HTML::TreeBuilder knows which tags can contain other tags, which
start tags have corresponding end tags, which tags can exist only in
the <HEAD> portion of the document, and so forth. HTML::SimpleParse
does not know any of these things. It just finds tags and text in the
HTML you give it, it does not care about the specific content of these
tags (though it does distiguish between different _types_ of tags,
such as comments, starting tags like <b>, ending tags like </b>, and
so on).
Second, HTML::SimpleParse does not create a hierarchical tree of HTML content,
but rather a simple linear list. It does not pay any attention to balancing
start tags with corresponding end tags, or which pairs of tags are inside other
pairs of tags.
Because of these characteristics, you can make a very effective HTML
filter by sub-classing HTML::SimpleParse. For example, to remove all comments
from HTML:
package NoComment;
use HTML::SimpleParse;
@ISA = qw(HTML::SimpleParse);
sub output_comment {}
package main;
NoComment->new($some_html)->output;
Historically, I started the HTML::SimpleParse project in part because
of a misunderstanding about HTML::Parser's functionality. Many
aspects of these two modules actually overlap. I continue to maintain
the HTML::SimpleParse module because people seem to be depending on
it, and because beginners sometimes find HTML::SimpleParse to be
simpler than HTML::Parser's more powerful interface. People also seem
to get a fair amount of usage out of the C<parse_args()> method
directly.
=head2 Methods
=over 4
=item * new
$p = new HTML::SimpleParse( $some_html );
Creates a new HTML::SimpleParse object. Optionally takes one argument,
a string containing some HTML with which to initialize the object. If
you give it a non-empty string, the HTML will be parsed into a tree and
ready for outputting.
Can also take a list of attributes, such as
$p = new HTML::SimpleParse( $some_html, 'fix_case' => -1);
See the C<parse_args()> method below for an explanation of this attribute.
=item * text
$text = $p->text;
$p->text( $new_text );
Get or set the contents of the HTML to be parsed.
=item * tree
foreach ($p->tree) { ... }
Returns a list of all the nodes in the tree, in case you want to step
through them manually or something. Each node in the tree is an
anonymous hash with (at least) three data members, $node->{type} (is
this a comment, a start tag, an end tag, etc.), $node->{content} (all
the text between the angle brackets, verbatim), and $node->{offset}
(number of bytes from the beginning of the string).
The possible values of $node->{type} are C<text>, C<starttag>,
C<endtag>, C<ssi>, and C<markup>.
=item * parse
$p->parse;
Once an object has been initialized with some text, call $p->parse and
a tree will be created. After the tree is created, you can call $p->output.
If you feed some text to the new() method, parse will be called automatically
during your object's construction.
=item * parse_args
%hash = $p->parse_args( $arg_string );
This routine is handy for parsing the contents of an HTML tag into key=value
pairs. For instance:
$text = 'type=checkbox checked name=flavor value="chocolate or strawberry"';
%hash = $p->parse_args( $text );
# %hash is ( TYPE=>'checkbox', CHECKED=>undef, NAME=>'flavor',
# VALUE=>'chocolate or strawberry' )
Note that the position of the last m//g search on the string (the value
returned by Perl's pos() function) will be altered by the parse_args function,
so make sure you take that into account if (in the above example) you do
C<$text =~ m/something/g>.
The parse_args() method can be run as either an object method or as a
class method, i.e. as either $p->parse_args(...) or
HTML::SimpleParse->parse_args(...).
HTML attribute lists are supposed to be case-insensitive with respect
to attribute names. To achieve this behavior, parse_args() respects
the 'fix_case' flag, which can be set either as a package global
$FIX_CASE, or as a class member datum 'fix_case'. If set to 0, no
case conversion is done. If set to 1, all keys are converted to upper
case. If set to -1, all keys are converted to lower case. The
default is 1, i.e. all keys are uppercased.
If an attribute takes no value (like "checked" in the above example) then it
will still have an entry in the returned hash, but its value will be C<undef>.
For example:
%hash = $p->parse_args('type=checkbox checked name=banana value=""');
# $hash{CHECKED} is undef, but $hash{VALUE} is ""
This method actually returns a list (not a hash), so duplicate attributes and
order will be preserved if you want them to be:
@hash = $p->parse_args("name=family value=gwen value=mom value=pop");
# @hash is qw(NAME family VALUE gwen VALUE mom VALUE pop)
=item * output
$p->output;
This will output the contents of the HTML, passing the real work off to
the output_text, output_comment, etc. functions. If you do not override any
of these methods, this module will output the exact text that it parsed into
a tree in the first place.
=item * get_output
print $p->get_output
Similar to $p->output(), but returns its result instead of printing it.
=item * execute
foreach ($p->tree) {
print $p->execute($_);
}
Executes a single node in the HTML parse tree. Useful if you want to loop
through the nodes and output them individually.
=back
The following methods do the actual outputting of the various parts of
the HTML. Override some of them if you want to change the way the HTML
is output. For instance, to strip comments from the HTML, override the
output_comment method like so:
# In subclass:
sub output_comment { } # Does nothing
=over 4
=item * output_text
=item * output_comment
=item * output_endtag
=item * output_starttag
=item * output_markup
=item * output_ssi
=back
=head1 CAVEATS
Please do not assume that the interface here is stable. This is a first pass,
and I'm still trying to incorporate suggestions from the community. If you
employ this module somewhere, make doubly sure before upgrading that none of your
code breaks when you use the newer version.
=head1 BUGS
=over 4
=item * Embedded >s are broken
Won't handle tags with embedded >s in them, like
<input name=expr value="x > y">. This will be fixed in a future
version, probably by using the parse_args method. Suggestions are welcome.
=back
=head1 TO DO
=over 4
=item * extensibility
Based on a suggestion from Randy Harmon (thanks), I'd like to make it easier
for subclasses of SimpleParse to pick out other kinds of HTML blocks, i.e.
extend the set {text, comment, endtag, starttag, markup, ssi} to include more
members. Currently the only easy way to do that is by overriding the
C<parse> method:
sub parse { # In subclass
my $self = $_[0];
$self->SUPER::parse(@_);
foreach ($self->tree) {
if ($_->{content} =~ m#^a\s+#i) {
$_->{type} = 'anchor_start';
}
}
}
sub output_anchor_start {
# Whatever you want...
}
Alternatively, this feature might be implemented by hanging attatchments
onto the parsing loop, like this:
my $parser = new SimpleParse( $html_text );
$regex = '<(a\s+.*?)>';
$parser->watch_for( 'anchor_start', $regex );
sub SimpleParse::output_anchor_start {
# Whatever you want...
}
I think I like that idea better. If you wanted to, you could make a subclass
with output_anchor_start as one of its methods, and put the ->watch_for
stuff in the constructor.
=item * reading from filehandles
It would be nice if you could initialize an object by giving it a filehandle
or filename instead of the text itself.
=item * tests
I need to write a few tests that run under "make test".
=back
=head1 AUTHOR
Ken Williams <ken@forum.swarthmore.edu>
=head1 COPYRIGHT
Copyright 1998 Swarthmore College. All rights reserved.
This library is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.
=cut

View File

@@ -0,0 +1,471 @@
package HTML::Tagset;
use strict;
=head1 NAME
HTML::Tagset - data tables useful in parsing HTML
=head1 VERSION
Version 3.20
=cut
use vars qw( $VERSION );
$VERSION = '3.20';
=head1 SYNOPSIS
use HTML::Tagset;
# Then use any of the items in the HTML::Tagset package
# as need arises
=head1 DESCRIPTION
This module contains several data tables useful in various kinds of
HTML parsing operations.
Note that all tag names used are lowercase.
In the following documentation, a "hashset" is a hash being used as a
set -- the hash conveys that its keys are there, and the actual values
associated with the keys are not significant. (But what values are
there, are always true.)
=cut
use vars qw(
$VERSION
%emptyElement %optionalEndTag %linkElements %boolean_attr
%isHeadElement %isBodyElement %isPhraseMarkup
%is_Possible_Strict_P_Content
%isHeadOrBodyElement
%isList %isTableElement %isFormElement
%isKnown %canTighten
@p_closure_barriers
%isCDATA_Parent
);
=head1 VARIABLES
Note that none of these variables are exported.
=head2 hashset %HTML::Tagset::emptyElement
This hashset has as values the tag-names (GIs) of elements that cannot
have content. (For example, "base", "br", "hr".) So
C<$HTML::Tagset::emptyElement{'hr'}> exists and is true.
C<$HTML::Tagset::emptyElement{'dl'}> does not exist, and so is not true.
=cut
%emptyElement = map {; $_ => 1 } qw(base link meta isindex
img br hr wbr
input area param
embed bgsound spacer
basefont col frame
~comment ~literal
~declaration ~pi
);
# The "~"-initial names are for pseudo-elements used by HTML::Entities
# and TreeBuilder
=head2 hashset %HTML::Tagset::optionalEndTag
This hashset lists tag-names for elements that can have content, but whose
end-tags are generally, "safely", omissible. Example:
C<$HTML::Tagset::emptyElement{'li'}> exists and is true.
=cut
%optionalEndTag = map {; $_ => 1 } qw(p li dt dd); # option th tr td);
=head2 hash %HTML::Tagset::linkElements
Values in this hash are tagnames for elements that might contain
links, and the value for each is a reference to an array of the names
of attributes whose values can be links.
=cut
%linkElements =
(
'a' => ['href'],
'applet' => ['archive', 'codebase', 'code'],
'area' => ['href'],
'base' => ['href'],
'bgsound' => ['src'],
'blockquote' => ['cite'],
'body' => ['background'],
'del' => ['cite'],
'embed' => ['pluginspage', 'src'],
'form' => ['action'],
'frame' => ['src', 'longdesc'],
'iframe' => ['src', 'longdesc'],
'ilayer' => ['background'],
'img' => ['src', 'lowsrc', 'longdesc', 'usemap'],
'input' => ['src', 'usemap'],
'ins' => ['cite'],
'isindex' => ['action'],
'head' => ['profile'],
'layer' => ['background', 'src'],
'link' => ['href'],
'object' => ['classid', 'codebase', 'data', 'archive', 'usemap'],
'q' => ['cite'],
'script' => ['src', 'for'],
'table' => ['background'],
'td' => ['background'],
'th' => ['background'],
'tr' => ['background'],
'xmp' => ['href'],
);
=head2 hash %HTML::Tagset::boolean_attr
This hash (not hashset) lists what attributes of what elements can be
printed without showing the value (for example, the "noshade" attribute
of "hr" elements). For elements with only one such attribute, its value
is simply that attribute name. For elements with many such attributes,
the value is a reference to a hashset containing all such attributes.
=cut
%boolean_attr = (
# TODO: make these all hashes
'area' => 'nohref',
'dir' => 'compact',
'dl' => 'compact',
'hr' => 'noshade',
'img' => 'ismap',
'input' => { 'checked' => 1, 'readonly' => 1, 'disabled' => 1 },
'menu' => 'compact',
'ol' => 'compact',
'option' => 'selected',
'select' => 'multiple',
'td' => 'nowrap',
'th' => 'nowrap',
'ul' => 'compact',
);
#==========================================================================
# List of all elements from Extensible HTML version 1.0 Transitional DTD:
#
# a abbr acronym address applet area b base basefont bdo big
# blockquote body br button caption center cite code col colgroup
# dd del dfn dir div dl dt em fieldset font form h1 h2 h3 h4 h5 h6
# head hr html i iframe img input ins isindex kbd label legend li
# link map menu meta noframes noscript object ol optgroup option p
# param pre q s samp script select small span strike strong style
# sub sup table tbody td textarea tfoot th thead title tr tt u ul
# var
#
# Varia from Mozilla source internal table of tags:
# Implemented:
# xmp listing wbr nobr frame frameset noframes ilayer
# layer nolayer spacer embed multicol
# But these are unimplemented:
# sound?? keygen?? server??
# Also seen here and there:
# marquee?? app?? (both unimplemented)
#==========================================================================
=head2 hashset %HTML::Tagset::isPhraseMarkup
This hashset contains all phrasal-level elements.
=cut
%isPhraseMarkup = map {; $_ => 1 } qw(
span abbr acronym q sub sup
cite code em kbd samp strong var dfn strike
b i u s tt small big
a img br
wbr nobr blink
font basefont bdo
spacer embed noembed
); # had: center, hr, table
=head2 hashset %HTML::Tagset::is_Possible_Strict_P_Content
This hashset contains all phrasal-level elements that be content of a
P element, for a strict model of HTML.
=cut
%is_Possible_Strict_P_Content = (
%isPhraseMarkup,
%isFormElement,
map {; $_ => 1} qw( object script map )
# I've no idea why there's these latter exceptions.
# I'm just following the HTML4.01 DTD.
);
#from html4 strict:
#<!ENTITY % fontstyle "TT | I | B | BIG | SMALL">
#
#<!ENTITY % phrase "EM | STRONG | DFN | CODE |
# SAMP | KBD | VAR | CITE | ABBR | ACRONYM" >
#
#<!ENTITY % special
# "A | IMG | OBJECT | BR | SCRIPT | MAP | Q | SUB | SUP | SPAN | BDO">
#
#<!ENTITY % formctrl "INPUT | SELECT | TEXTAREA | LABEL | BUTTON">
#
#<!-- %inline; covers inline or "text-level" elements -->
#<!ENTITY % inline "#PCDATA | %fontstyle; | %phrase; | %special; | %formctrl;">
=head2 hashset %HTML::Tagset::isHeadElement
This hashset contains all elements that elements that should be
present only in the 'head' element of an HTML document.
=cut
%isHeadElement = map {; $_ => 1 }
qw(title base link meta isindex script style object bgsound);
=head2 hashset %HTML::Tagset::isList
This hashset contains all elements that can contain "li" elements.
=cut
%isList = map {; $_ => 1 } qw(ul ol dir menu);
=head2 hashset %HTML::Tagset::isTableElement
This hashset contains all elements that are to be found only in/under
a "table" element.
=cut
%isTableElement = map {; $_ => 1 }
qw(tr td th thead tbody tfoot caption col colgroup);
=head2 hashset %HTML::Tagset::isFormElement
This hashset contains all elements that are to be found only in/under
a "form" element.
=cut
%isFormElement = map {; $_ => 1 }
qw(input select option optgroup textarea button label);
=head2 hashset %HTML::Tagset::isBodyMarkup
This hashset contains all elements that are to be found only in/under
the "body" element of an HTML document.
=cut
%isBodyElement = map {; $_ => 1 } qw(
h1 h2 h3 h4 h5 h6
p div pre plaintext address blockquote
xmp listing
center
multicol
iframe ilayer nolayer
bgsound
hr
ol ul dir menu li
dl dt dd
ins del
fieldset legend
map area
applet param object
isindex script noscript
table
center
form
),
keys %isFormElement,
keys %isPhraseMarkup, # And everything phrasal
keys %isTableElement,
;
=head2 hashset %HTML::Tagset::isHeadOrBodyElement
This hashset includes all elements that I notice can fall either in
the head or in the body.
=cut
%isHeadOrBodyElement = map {; $_ => 1 }
qw(script isindex style object map area param noscript bgsound);
# i.e., if we find 'script' in the 'body' or the 'head', don't freak out.
=head2 hashset %HTML::Tagset::isKnown
This hashset lists all known HTML elements.
=cut
%isKnown = (%isHeadElement, %isBodyElement,
map{; $_=>1 }
qw( head body html
frame frameset noframes
~comment ~pi ~directive ~literal
));
# that should be all known tags ever ever
=head2 hashset %HTML::Tagset::canTighten
This hashset lists elements that might have ignorable whitespace as
children or siblings.
=cut
%canTighten = %isKnown;
delete @canTighten{
keys(%isPhraseMarkup), 'input', 'select',
'xmp', 'listing', 'plaintext', 'pre',
};
# xmp, listing, plaintext, and pre are untightenable, and
# in a really special way.
@canTighten{'hr','br'} = (1,1);
# exceptional 'phrasal' things that ARE subject to tightening.
# The one case where I can think of my tightening rules failing is:
# <p>foo bar<center> <em>baz quux</em> ...
# ^-- that would get deleted.
# But that's pretty gruesome code anyhow. You gets what you pays for.
#==========================================================================
=head2 array @HTML::Tagset::p_closure_barriers
This array has a meaning that I have only seen a need for in
C<HTML::TreeBuilder>, but I include it here on the off chance that someone
might find it of use:
When we see a "E<lt>pE<gt>" token, we go lookup up the lineage for a p
element we might have to minimize. At first sight, we might say that
if there's a p anywhere in the lineage of this new p, it should be
closed. But that's wrong. Consider this document:
<html>
<head>
<title>foo</title>
</head>
<body>
<p>foo
<table>
<tr>
<td>
foo
<p>bar
</td>
</tr>
</table>
</p>
</body>
</html>
The second p is quite legally inside a much higher p.
My formalization of the reason why this is legal, but this:
<p>foo<p>bar</p></p>
isn't, is that something about the table constitutes a "barrier" to
the application of the rule about what p must minimize.
So C<@HTML::Tagset::p_closure_barriers> is the list of all such
barrier-tags.
=cut
@p_closure_barriers = qw(
li blockquote
ul ol menu dir
dl dt dd
td th tr table caption
div
);
# In an ideal world (i.e., XHTML) we wouldn't have to bother with any of this
# monkey business of barriers to minimization!
=head2 hashset %isCDATA_Parent
This hashset includes all elements whose content is CDATA.
=cut
%isCDATA_Parent = map {; $_ => 1 }
qw(script style xmp listing plaintext);
# TODO: there's nothing else that takes CDATA children, right?
# As the HTML3 DTD (Raggett 1995-04-24) noted:
# The XMP, LISTING and PLAINTEXT tags are incompatible with SGML
# and derive from very early versions of HTML. They require non-
# standard parsers and will cause problems for processing
# documents with standard SGML tools.
=head1 CAVEATS
You may find it useful to alter the behavior of modules (like
C<HTML::Element> or C<HTML::TreeBuilder>) that use C<HTML::Tagset>'s
data tables by altering the data tables themselves. You are welcome
to try, but be careful; and be aware that different modules may or may
react differently to the data tables being changed.
Note that it may be inappropriate to use these tables for I<producing>
HTML -- for example, C<%isHeadOrBodyElement> lists the tagnames
for all elements that can appear either in the head or in the body,
such as "script". That doesn't mean that I am saying your code that
produces HTML should feel free to put script elements in either place!
If you are producing programs that spit out HTML, you should be
I<intimately> familiar with the DTDs for HTML or XHTML (available at
C<http://www.w3.org/>), and you should slavishly obey them, not
the data tables in this document.
=head1 SEE ALSO
L<HTML::Element>, L<HTML::TreeBuilder>, L<HTML::LinkExtor>
=head1 COPYRIGHT & LICENSE
Copyright 1995-2000 Gisle Aas.
Copyright 2000-2005 Sean M. Burke.
Copyright 2005-2008 Andy Lester.
This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.
=head1 ACKNOWLEDGEMENTS
Most of the code/data in this module was adapted from code written
by Gisle Aas for C<HTML::Element>, C<HTML::TreeBuilder>, and
C<HTML::LinkExtor>. Then it was maintained by Sean M. Burke.
=head1 AUTHOR
Current maintainer: Andy Lester, C<< <andy at petdance.com> >>
=head1 BUGS
Please report any bugs or feature requests to
C<bug-html-tagset at rt.cpan.org>, or through the web interface at
L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=HTML-Tagset>. I will
be notified, and then you'll automatically be notified of progress on
your bug as I make changes.
=cut
1;

View File

@@ -0,0 +1,369 @@
package HTML::TokeParser;
require HTML::PullParser;
@ISA=qw(HTML::PullParser);
$VERSION = "3.57";
use strict;
use Carp ();
use HTML::Entities qw(decode_entities);
use HTML::Tagset ();
my %ARGS =
(
start => "'S',tagname,attr,attrseq,text",
end => "'E',tagname,text",
text => "'T',text,is_cdata",
process => "'PI',token0,text",
comment => "'C',text",
declaration => "'D',text",
# options that default on
unbroken_text => 1,
);
sub new
{
my $class = shift;
my %cnf;
if (@_ == 1) {
my $type = (ref($_[0]) eq "SCALAR") ? "doc" : "file";
%cnf = ($type => $_[0]);
}
else {
%cnf = @_;
}
my $textify = delete $cnf{textify} || {img => "alt", applet => "alt"};
my $self = $class->SUPER::new(%cnf, %ARGS) || return undef;
$self->{textify} = $textify;
$self;
}
sub get_tag
{
my $self = shift;
my $token;
while (1) {
$token = $self->get_token || return undef;
my $type = shift @$token;
next unless $type eq "S" || $type eq "E";
substr($token->[0], 0, 0) = "/" if $type eq "E";
return $token unless @_;
for (@_) {
return $token if $token->[0] eq $_;
}
}
}
sub _textify {
my($self, $token) = @_;
my $tag = $token->[1];
return undef unless exists $self->{textify}{$tag};
my $alt = $self->{textify}{$tag};
my $text;
if (ref($alt)) {
$text = &$alt(@$token);
} else {
$text = $token->[2]{$alt || "alt"};
$text = "[\U$tag]" unless defined $text;
}
return $text;
}
sub get_text
{
my $self = shift;
my @text;
while (my $token = $self->get_token) {
my $type = $token->[0];
if ($type eq "T") {
my $text = $token->[1];
decode_entities($text) unless $token->[2];
push(@text, $text);
} elsif ($type =~ /^[SE]$/) {
my $tag = $token->[1];
if ($type eq "S") {
if (defined(my $text = _textify($self, $token))) {
push(@text, $text);
next;
}
} else {
$tag = "/$tag";
}
if (!@_ || grep $_ eq $tag, @_) {
$self->unget_token($token);
last;
}
push(@text, " ")
if $tag eq "br" || !$HTML::Tagset::isPhraseMarkup{$token->[1]};
}
}
join("", @text);
}
sub get_trimmed_text
{
my $self = shift;
my $text = $self->get_text(@_);
$text =~ s/^\s+//; $text =~ s/\s+$//; $text =~ s/\s+/ /g;
$text;
}
sub get_phrase {
my $self = shift;
my @text;
while (my $token = $self->get_token) {
my $type = $token->[0];
if ($type eq "T") {
my $text = $token->[1];
decode_entities($text) unless $token->[2];
push(@text, $text);
} elsif ($type =~ /^[SE]$/) {
my $tag = $token->[1];
if ($type eq "S") {
if (defined(my $text = _textify($self, $token))) {
push(@text, $text);
next;
}
}
if (!$HTML::Tagset::isPhraseMarkup{$tag}) {
$self->unget_token($token);
last;
}
push(@text, " ") if $tag eq "br";
}
}
my $text = join("", @text);
$text =~ s/^\s+//; $text =~ s/\s+$//; $text =~ s/\s+/ /g;
$text;
}
1;
__END__
=head1 NAME
HTML::TokeParser - Alternative HTML::Parser interface
=head1 SYNOPSIS
require HTML::TokeParser;
$p = HTML::TokeParser->new("index.html") ||
die "Can't open: $!";
$p->empty_element_tags(1); # configure its behaviour
while (my $token = $p->get_token) {
#...
}
=head1 DESCRIPTION
The C<HTML::TokeParser> is an alternative interface to the
C<HTML::Parser> class. It is an C<HTML::PullParser> subclass with a
predeclared set of token types. If you wish the tokens to be reported
differently you probably want to use the C<HTML::PullParser> directly.
The following methods are available:
=over 4
=item $p = HTML::TokeParser->new( $filename, %opt );
=item $p = HTML::TokeParser->new( $filehandle, %opt );
=item $p = HTML::TokeParser->new( \$document, %opt );
The object constructor argument is either a file name, a file handle
object, or the complete document to be parsed. Extra options can be
provided as key/value pairs and are processed as documented by the base
classes.
If the argument is a plain scalar, then it is taken as the name of a
file to be opened and parsed. If the file can't be opened for
reading, then the constructor will return C<undef> and $! will tell
you why it failed.
If the argument is a reference to a plain scalar, then this scalar is
taken to be the literal document to parse. The value of this
scalar should not be changed before all tokens have been extracted.
Otherwise the argument is taken to be some object that the
C<HTML::TokeParser> can read() from when it needs more data. Typically
it will be a filehandle of some kind. The stream will be read() until
EOF, but not closed.
A newly constructed C<HTML::TokeParser> differ from its base classes
by having the C<unbroken_text> attribute enabled by default. See
L<HTML::Parser> for a description of this and other attributes that
influence how the document is parsed. It is often a good idea to enable
C<empty_element_tags> behaviour.
Note that the parsing result will likely not be valid if raw undecoded
UTF-8 is used as a source. When parsing UTF-8 encoded files turn
on UTF-8 decoding:
open(my $fh, "<:utf8", "index.html") || die "Can't open 'index.html': $!";
my $p = HTML::TokeParser->new( $fh );
# ...
If a $filename is passed to the constructor the file will be opened in
raw mode and the parsing result will only be valid if its content is
Latin-1 or pure ASCII.
If parsing from an UTF-8 encoded string buffer decode it first:
utf8::decode($document);
my $p = HTML::TokeParser->new( \$document );
# ...
=item $p->get_token
This method will return the next I<token> found in the HTML document,
or C<undef> at the end of the document. The token is returned as an
array reference. The first element of the array will be a string
denoting the type of this token: "S" for start tag, "E" for end tag,
"T" for text, "C" for comment, "D" for declaration, and "PI" for
process instructions. The rest of the token array depend on the type
like this:
["S", $tag, $attr, $attrseq, $text]
["E", $tag, $text]
["T", $text, $is_data]
["C", $text]
["D", $text]
["PI", $token0, $text]
where $attr is a hash reference, $attrseq is an array reference and
the rest are plain scalars. The L<HTML::Parser/Argspec> explains the
details.
=item $p->unget_token( @tokens )
If you find you have read too many tokens you can push them back,
so that they are returned the next time $p->get_token is called.
=item $p->get_tag
=item $p->get_tag( @tags )
This method returns the next start or end tag (skipping any other
tokens), or C<undef> if there are no more tags in the document. If
one or more arguments are given, then we skip tokens until one of the
specified tag types is found. For example:
$p->get_tag("font", "/font");
will find the next start or end tag for a font-element.
The tag information is returned as an array reference in the same form
as for $p->get_token above, but the type code (first element) is
missing. A start tag will be returned like this:
[$tag, $attr, $attrseq, $text]
The tagname of end tags are prefixed with "/", i.e. end tag is
returned like this:
["/$tag", $text]
=item $p->get_text
=item $p->get_text( @endtags )
This method returns all text found at the current position. It will
return a zero length string if the next token is not text. Any
entities will be converted to their corresponding character.
If one or more arguments are given, then we return all text occurring
before the first of the specified tags found. For example:
$p->get_text("p", "br");
will return the text up to either a paragraph of linebreak element.
The text might span tags that should be I<textified>. This is
controlled by the $p->{textify} attribute, which is a hash that
defines how certain tags can be treated as text. If the name of a
start tag matches a key in this hash then this tag is converted to
text. The hash value is used to specify which tag attribute to obtain
the text from. If this tag attribute is missing, then the upper case
name of the tag enclosed in brackets is returned, e.g. "[IMG]". The
hash value can also be a subroutine reference. In this case the
routine is called with the start tag token content as its argument and
the return value is treated as the text.
The default $p->{textify} value is:
{img => "alt", applet => "alt"}
This means that <IMG> and <APPLET> tags are treated as text, and that
the text to substitute can be found in the ALT attribute.
=item $p->get_trimmed_text
=item $p->get_trimmed_text( @endtags )
Same as $p->get_text above, but will collapse any sequences of white
space to a single space character. Leading and trailing white space is
removed.
=item $p->get_phrase
This will return all text found at the current position ignoring any
phrasal-level tags. Text is extracted until the first non
phrasal-level tag. Textification of tags is the same as for
get_text(). This method will collapse white space in the same way as
get_trimmed_text() does.
The definition of <i>phrasal-level tags</i> is obtained from the
HTML::Tagset module.
=back
=head1 EXAMPLES
This example extracts all links from a document. It will print one
line for each link, containing the URL and the textual description
between the <A>...</A> tags:
use HTML::TokeParser;
$p = HTML::TokeParser->new(shift||"index.html");
while (my $token = $p->get_tag("a")) {
my $url = $token->[1]{href} || "-";
my $text = $p->get_trimmed_text("/a");
print "$url\t$text\n";
}
This example extract the <TITLE> from the document:
use HTML::TokeParser;
$p = HTML::TokeParser->new(shift||"index.html");
if ($p->get_tag("title")) {
my $title = $p->get_trimmed_text;
print "Title: $title\n";
}
=head1 SEE ALSO
L<HTML::PullParser>, L<HTML::Parser>
=head1 COPYRIGHT
Copyright 1998-2005 Gisle Aas.
This library is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.
=cut