Initial Commit
This commit is contained in:
690
database/perl/lib/HTML/Clean.pm
Normal file
690
database/perl/lib/HTML/Clean.pm
Normal file
@@ -0,0 +1,690 @@
|
||||
package HTML::Clean;
|
||||
|
||||
use Carp;
|
||||
use IO;
|
||||
use Fcntl;
|
||||
use strict;
|
||||
require 5.004;
|
||||
|
||||
use vars qw($VERSION @ISA @EXPORT @EXPORT_OK);
|
||||
|
||||
require Exporter;
|
||||
require AutoLoader;
|
||||
|
||||
# Items to export to callers namespace
|
||||
@EXPORT = qw();
|
||||
|
||||
$VERSION = '0.8';
|
||||
|
||||
=head1 NAME
|
||||
|
||||
HTML::Clean - Cleans up HTML code for web browsers, not humans
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
use HTML::Clean;
|
||||
$h = new HTML::Clean($filename); # or..
|
||||
$h = new HTML::Clean($htmlcode);
|
||||
|
||||
$h->compat();
|
||||
$h->strip();
|
||||
$data = $h->data();
|
||||
print $$data;
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
The HTML::Clean module encapsulates a number of common techniques for
|
||||
minimizing the size of HTML files. You can typically save between
|
||||
10% and 50% of the size of a HTML file using these methods.
|
||||
It provides the following features:
|
||||
|
||||
=over 8
|
||||
|
||||
=item Remove unneeded whitespace (begining of line, etc)
|
||||
|
||||
=item Remove unneeded META elements.
|
||||
|
||||
=item Remove HTML comments (except for styles, javascript and SSI)
|
||||
|
||||
=item Replace tags with equivilant shorter tags (<strong> --> <b>)
|
||||
|
||||
=item etc.
|
||||
|
||||
=back
|
||||
|
||||
The entire proces is configurable, so you can pick and choose what you want
|
||||
to clean.
|
||||
|
||||
=head1 THE HTML::Clean CLASS
|
||||
|
||||
=over 4
|
||||
|
||||
=cut
|
||||
|
||||
|
||||
######################################################################
|
||||
|
||||
=head2 $h = new HTML::Clean($dataorfile, [$level]);
|
||||
|
||||
This creates a new HTML::Clean object. A Prerequisite for all other
|
||||
functions in this module.
|
||||
|
||||
The $dataorfile parameter supplies the input HTML, either a filename,
|
||||
or a reference to a scalar value holding the HTML, for example:
|
||||
|
||||
$h = new HTML::Clean("/htdocs/index.html");
|
||||
$html = "<strong>Hello!</strong>";
|
||||
$h = new HTML::Clean(\$html);
|
||||
|
||||
An optional 'level' parameter controls the level of optimization
|
||||
performed. Levels range from 1 to 9. Level 1 includes only simple
|
||||
fast optimizations. Level 9 includes all optimizations.
|
||||
|
||||
=cut
|
||||
|
||||
sub new {
|
||||
my $this = shift;
|
||||
my $class = ref($this) || $this;
|
||||
my $self = {};
|
||||
bless $self, $class;
|
||||
|
||||
my $data = shift;
|
||||
my $level = shift;
|
||||
|
||||
if ($self->initialize($data)) {
|
||||
# set the default level
|
||||
$level = 9 if (!$level);
|
||||
$self->level($level);
|
||||
return $self;
|
||||
} else {
|
||||
undef $self;
|
||||
return undef;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#
|
||||
# Set up the data in the self hash..
|
||||
#
|
||||
|
||||
=head2 $h->initialize($dataorfile)
|
||||
|
||||
This function allows you to reinitialize the HTML data used by the
|
||||
current object. This is useful if you are processing many files.
|
||||
|
||||
$dataorfile has the same usage as the new method.
|
||||
|
||||
Return 0 for an error, 1 for success.
|
||||
|
||||
=cut
|
||||
|
||||
sub initialize {
|
||||
my($self, $data) = @_;
|
||||
$self->{'DATA'} = undef;
|
||||
|
||||
# Not defined? Just return true.
|
||||
return(1) if (!$data);
|
||||
|
||||
# Check if it's a ref
|
||||
if (ref($data)) {
|
||||
$self->{DATA} = $data;
|
||||
return(1);
|
||||
}
|
||||
|
||||
# Newline char, really an error, but just go with it..
|
||||
if ($data =~ /\n/) {
|
||||
$self->{'DATA'} = \$data;
|
||||
}
|
||||
|
||||
# No newline? Must be a filename
|
||||
if (-f $data) {
|
||||
my $storage;
|
||||
|
||||
sysopen(IN, "$data", O_RDONLY) || return(0);
|
||||
while (<IN>) {
|
||||
$storage .= $_;
|
||||
}
|
||||
close(IN);
|
||||
$self->{'DATA'} = \$storage;
|
||||
return(1);
|
||||
}
|
||||
|
||||
return(0); # file not found?
|
||||
}
|
||||
|
||||
|
||||
=head2 $h->level([$level])
|
||||
|
||||
Get/set the optimization level. $level is a number from 1 to 9.
|
||||
|
||||
=cut
|
||||
|
||||
sub level {
|
||||
my($self, $level) = @_;
|
||||
|
||||
if (defined($level) && ($level > 0) && ($level < 10)) {
|
||||
$self->{'LEVEL'} = $level
|
||||
}
|
||||
return($self->{'LEVEL'});
|
||||
}
|
||||
|
||||
=head2 $myref = $h->data()
|
||||
|
||||
Returns the current HTML data as a scalar reference.
|
||||
|
||||
=cut
|
||||
|
||||
sub data {
|
||||
my($self) = @_;
|
||||
|
||||
return $self->{'DATA'};
|
||||
}
|
||||
|
||||
|
||||
# Junk HTML comments (INTERNAL)
|
||||
|
||||
sub _commentcheck($) {
|
||||
my($comment) = @_;
|
||||
|
||||
$_ = $comment;
|
||||
|
||||
# Server side include
|
||||
return($comment) if (m,^<!--\#,si);
|
||||
|
||||
# ITU Hack.. preserve some frontpage components
|
||||
return($comment) if (m,^<!-- %,si);
|
||||
return($comment) if (m,bot="(SaveResults|Search|ConfirmationField)",si);
|
||||
|
||||
# Javascript
|
||||
return($comment) if (m,//.*-->$,si);
|
||||
return($comment) if (m,navigator\.app(name|version),si);
|
||||
|
||||
# Stylesheet
|
||||
return($comment) if (m,[A-z0-9]+\:[A-z0-9]+\s*\{.*\},si);
|
||||
return('');
|
||||
}
|
||||
|
||||
|
||||
# Remove javascript comments (INTERNAL)
|
||||
|
||||
sub _jscomments {
|
||||
my($js) = @_;
|
||||
|
||||
$js =~ s,\n\s*//.*?\n,\n,sig;
|
||||
$js =~ s,\s+//.*?\n,\n,sig;
|
||||
|
||||
# insure javascript is hidden
|
||||
|
||||
if ($js =~ m,<!--,) {
|
||||
$js =~ s,</script>,// -->\n</script>,si;
|
||||
}
|
||||
return($js);
|
||||
}
|
||||
|
||||
# Clean up other javascript stuff..
|
||||
|
||||
sub _javascript {
|
||||
my($js) = @_;
|
||||
|
||||
# remove excess whitespace at the beginning and end of lines
|
||||
$js =~ s,\s*\n+\s*,\n,sig;
|
||||
|
||||
# braces/semicolon at end of line, join next line
|
||||
$js =~ s,([;{}])\n,$1,sig;
|
||||
|
||||
# What else is safe to do?
|
||||
|
||||
return($js);
|
||||
}
|
||||
|
||||
# replace #000000 -> black, etc..
|
||||
# Does the browser render faster with RGB? You would think so..
|
||||
|
||||
sub _defcolorcheck ($) {
|
||||
my($c) = @_;
|
||||
|
||||
$c =~ s/\#000000/black/;
|
||||
$c =~ s/\#c0c0c0/silver/i;
|
||||
$c =~ s/\#808080/gray/;
|
||||
$c =~ s/\#ffffff/white/i;
|
||||
$c =~ s/\#800000/maroon/;
|
||||
$c =~ s/\#ff0000/red/i;
|
||||
$c =~ s/\#800080/purple/;
|
||||
$c =~ s/\#ff00ff/fuchsia/i;
|
||||
$c =~ s/\#ff00ff/fuchsia/i;
|
||||
$c =~ s/\#008000/green/;
|
||||
$c =~ s/\#00ff00/lime/i;
|
||||
$c =~ s/\#808000/olive/;
|
||||
$c =~ s/\#ffff00/yellow/i;
|
||||
$c =~ s/\#000080/navy/;
|
||||
$c =~ s/\#0000ff/blue/i;
|
||||
$c =~ s/\#008080/teal/i;
|
||||
$c =~ s/\#00ffff/aqua/i;
|
||||
return($c);
|
||||
}
|
||||
|
||||
# For replacing entities with numerics
|
||||
use vars qw/ %_ENTITIES/;
|
||||
%_ENTITIES = (
|
||||
'Agrave' => 192,
|
||||
'Aacute' => 193,
|
||||
'Acirc' => 194,
|
||||
'Atilde' => 195,
|
||||
'Auml' => 196,
|
||||
'Aring' => 197,
|
||||
'AElig' => 198,
|
||||
'Ccedil' => 199,
|
||||
'Egrave' => 200,
|
||||
'Eacute' => 201,
|
||||
'Ecirc' => 202,
|
||||
'Euml' => 203,
|
||||
'Igrave' => 204,
|
||||
'Iacute' => 205,
|
||||
'Icirc' => 206,
|
||||
'Iuml' => 207,
|
||||
'ETH' => 208,
|
||||
'Ntilde' => 209,
|
||||
'Ograve' => 210,
|
||||
'Oacute' => 211,
|
||||
'Ocirc' => 212,
|
||||
'Otilde' => 213,
|
||||
'Ouml' => 214,
|
||||
'Oslash' => 216,
|
||||
'Ugrave' => 217,
|
||||
'Uacute' => 218,
|
||||
'Ucirc' => 219,
|
||||
'Uuml' => 220,
|
||||
'Yacute' => 221,
|
||||
'THORN' => 222,
|
||||
'szlig' => 223,
|
||||
'agrave' => 224,
|
||||
'aacute' => 225,
|
||||
'acirc' => 226,
|
||||
'atilde' => 227,
|
||||
'auml' => 228,
|
||||
'aring' => 229,
|
||||
'aelig' => 230,
|
||||
'ccedil' => 231,
|
||||
'egrave' => 232,
|
||||
'eacute' => 233,
|
||||
'ecirc' => 234,
|
||||
'euml' => 235,
|
||||
'igrave' => 236,
|
||||
'iacute' => 237,
|
||||
'icirc' => 238,
|
||||
'iuml' => 239,
|
||||
'eth' => 240,
|
||||
'ntilde' => 241,
|
||||
'ograve' => 242,
|
||||
'oacute' => 243,
|
||||
'ocirc' => 244,
|
||||
'otilde' => 245,
|
||||
'ouml' => 246,
|
||||
'oslash' => 248,
|
||||
'ugrave' => 249,
|
||||
'uacute' => 250,
|
||||
'ucirc' => 251,
|
||||
'uuml' => 252,
|
||||
'yacute' => 253,
|
||||
'thorn' => 254,
|
||||
'yuml' => 255
|
||||
);
|
||||
|
||||
=head2 strip(\%options);
|
||||
|
||||
Removes excess space from HTML
|
||||
|
||||
You can control the optimizations used by specifying them in the
|
||||
%options hash reference.
|
||||
|
||||
The following options are recognized:
|
||||
|
||||
=over 8
|
||||
|
||||
=item boolean values (0 or 1 values)
|
||||
|
||||
whitespace Remove excess whitespace
|
||||
shortertags <strong> -> <b>, etc..
|
||||
blink No blink tags.
|
||||
contenttype Remove default contenttype.
|
||||
comments Remove excess comments.
|
||||
entities " -> ", etc.
|
||||
dequote remove quotes from tag parameters where possible.
|
||||
defcolor recode colors in shorter form. (#ffffff -> white, etc.)
|
||||
javascript remove excess spaces and newlines in javascript code.
|
||||
htmldefaults remove default values for some html tags
|
||||
lowercasetags translate all HTML tags to lowercase
|
||||
|
||||
=item parameterized values
|
||||
|
||||
meta Takes a space separated list of meta tags to remove,
|
||||
default "GENERATOR FORMATTER"
|
||||
|
||||
emptytags Takes a space separated list of tags to remove when there is no
|
||||
content between the start and end tag, like this: <b></b>.
|
||||
The default is 'b i font center'
|
||||
|
||||
=back
|
||||
|
||||
=cut
|
||||
|
||||
use vars qw/
|
||||
$do_whitespace
|
||||
$do_shortertags
|
||||
$do_meta
|
||||
$do_blink
|
||||
$do_contenttype
|
||||
$do_comments
|
||||
$do_entities
|
||||
$do_dequote
|
||||
$do_defcolor
|
||||
$do_emptytags
|
||||
$do_javascript
|
||||
$do_htmldefaults
|
||||
$do_lowercasetags
|
||||
$do_defbaseurl
|
||||
/;
|
||||
|
||||
$do_whitespace = 1;
|
||||
$do_shortertags = 1;
|
||||
$do_meta = "generator formatter";
|
||||
$do_blink = 1;
|
||||
$do_contenttype = 1;
|
||||
$do_comments = 1;
|
||||
$do_entities = 1;
|
||||
$do_dequote = 1;
|
||||
$do_defcolor = 1;
|
||||
$do_emptytags = 'b i font center';
|
||||
$do_javascript = 1;
|
||||
$do_htmldefaults = 1;
|
||||
$do_lowercasetags = 1;
|
||||
$do_defbaseurl = '';
|
||||
|
||||
sub strip {
|
||||
my($self, $options) = @_;
|
||||
|
||||
my $h = $self->{'DATA'};
|
||||
my $level = $self->{'LEVEL'};
|
||||
|
||||
# Select a set of options based on $level, and then modify based on
|
||||
# user supplied options.
|
||||
|
||||
_level_defaults($level);
|
||||
|
||||
if(defined($options)) {
|
||||
no strict 'refs';
|
||||
for (keys(%$options)) {
|
||||
${"do_" . lc($_)} = $options->{$_} if defined ${"do_" . lc($_)};
|
||||
}
|
||||
}
|
||||
|
||||
if ($do_shortertags) {
|
||||
$$h =~ s,<strong>,<b>,sgi;
|
||||
$$h =~ s,</strong>,</b>,sgi;
|
||||
$$h =~ s,<em>,<i>,sgi;
|
||||
$$h =~ s,</em>,</i>,sgi;
|
||||
}
|
||||
|
||||
if ($do_whitespace) {
|
||||
$$h =~ s,[\r\n]+,\n,sg; # Carriage/LF -> LF
|
||||
$$h =~ s,\s+\n,\n,sg; # empty line
|
||||
$$h =~ s,\n\s+<,\n<,sg; # space before tag
|
||||
$$h =~ s,\n\s+,\n ,sg; # other spaces
|
||||
|
||||
$$h =~ s,>\n\s*<,><,sg; # LF/spaces between tags..
|
||||
|
||||
# Remove excess spaces within tags.. note, we could parse out the elements
|
||||
# and rewrite for excess spaces between elements. perhaps next version.
|
||||
# removed due to problems with > and < in tag elements..
|
||||
#$$h =~ s,\s+>,>,sg;
|
||||
#$$h =~ s,<\s+,<,sg;
|
||||
# do this again later..
|
||||
}
|
||||
|
||||
if ($do_entities) {
|
||||
$$h =~ s,",\",sg;
|
||||
# Simplify long entity names if using default charset...
|
||||
$$h =~ m,charset=([^\"]+)\",;
|
||||
if (!defined($1) || ($1 eq 'iso-8859-1')) {
|
||||
$$h =~ s,&([A-z]+);,($_ENTITIES{$1}) ? chr($_ENTITIES{$1}) : $&,sige;
|
||||
}
|
||||
}
|
||||
|
||||
if ($do_meta) {
|
||||
foreach my $m (split(/\s+/, $do_meta)) {
|
||||
$$h =~ s,<meta name="$m"[^>]*?>,,sig;
|
||||
}
|
||||
}
|
||||
if ($do_contenttype) {
|
||||
# Don't need this, since it is the default for most web servers
|
||||
# Also gets rid of 'blinking pages' in older versions of netscape.
|
||||
$$h =~ s,<meta http-equiv="Content-Type".*?content="text/html;.*?charset=iso-8859-1">,,sig;
|
||||
}
|
||||
|
||||
if ($do_defcolor) {
|
||||
$$h =~ s,(<[^<]+?color=['"]?\#[0-9A-Fa-f]+["']?),_defcolorcheck($&),sige;
|
||||
}
|
||||
if ($do_comments) {
|
||||
# don't strip server side includes..
|
||||
# try not to get javascript, or styles...
|
||||
$$h =~ s,<!--.*?-->,_commentcheck($&),sige;
|
||||
|
||||
# Remove javascript comments
|
||||
$$h =~ s,<script[^>]*(java|ecma)script[^>]*>.*?</script>,_jscomments($&),sige;
|
||||
}
|
||||
|
||||
if ($do_javascript) {
|
||||
#
|
||||
$$h =~ s,<script[^>]*(java|ecma)script[^>]*>.*?</script>,_javascript($&),sige;
|
||||
}
|
||||
|
||||
if ($do_blink) {
|
||||
$$h =~ s,<BLINK>,,sgi;
|
||||
$$h =~ s,</BLINK>,,sgi;
|
||||
}
|
||||
|
||||
if ($do_dequote) {
|
||||
while ($$h =~ s,<([A-z]+ [A-z]+=)(['"])([A-z0-9]+)\2(\s*?[^>]*?>),<$1$3$4,sig)
|
||||
{
|
||||
# Remove alphanumeric quotes. Note, breaks DTD..
|
||||
;
|
||||
}
|
||||
}
|
||||
# remove <b></b>, etc..
|
||||
if ($do_emptytags) {
|
||||
my $pat = $do_emptytags;
|
||||
$pat =~ s/\s+/|/g;
|
||||
|
||||
while ($$h =~ s,<($pat)(\s+[^>]*?)?>\s*</\1>,,siog){}
|
||||
|
||||
}
|
||||
if ($do_htmldefaults) {
|
||||
# Tables
|
||||
# seems to break things..
|
||||
#$$h =~ s,(<table[^>]*)\s+border=0([^>]*>),$1$2,sig;
|
||||
$$h =~ s,(<td[^>]*)\s+rowspan=1([^>]*>),$1$2,sig;
|
||||
$$h =~ s,(<td[^>]*)\s+colspan=1([^>]*>),$1$2,sig;
|
||||
|
||||
#
|
||||
|
||||
# P, TABLE tags are default left aligned..
|
||||
# lynx is inconsistent in this manner though..
|
||||
|
||||
$$h =~ s,<(P|table|td)( [^>]*)align=\"?left\"?([^>]*)>,<$1$2$3>,sig;
|
||||
|
||||
# OL start=1
|
||||
$$h =~ s,(<OL [^>]*)start=\"?1\"?([^>]*>),$1$2,sig;
|
||||
|
||||
# FORM
|
||||
$$h =~ s,(<form [^>]*)method=\"?get\"?([^>]*>),$1$2,sig;
|
||||
$$h =~ s,(<form [^>]*)enctype=\"application/x-www-form-urlencoded\"([^>]*>),$1$2,sig;
|
||||
|
||||
# hr
|
||||
$$h =~ s,(<hr [^>]*)align=\"?center\"?([^>]*>),$1$2,sig;
|
||||
$$h =~ s,(<hr [^>]*)width=\"?100%\"?([^>]*>),$1$2,sig;
|
||||
|
||||
# URLs
|
||||
$$h =~ s,(href|src)(=\"?http://[^/:]+):80/,$1$2/,sig;
|
||||
}
|
||||
|
||||
if ($do_whitespace) {
|
||||
# remove space within tags <center > becomes <center>
|
||||
$$h =~ s,\s+>,>,sg;
|
||||
$$h =~ s,<\s+,<,sg;
|
||||
# join lines with a space at the beginning/end of the line
|
||||
# and a line that begins with a tag
|
||||
$$h =~ s,>\n ,> ,sig;
|
||||
$$h =~ s, \n<, <,sig;
|
||||
}
|
||||
|
||||
if ($do_lowercasetags) {
|
||||
# translate tags to lowercase to (hopefully) improve compressability..
|
||||
|
||||
# simple tags <H1>, </H1> etc.
|
||||
$$h =~ s,(<[/]?[a-zA-Z][a-zA-Z0-9_-]*\s*>),\L$1\E,sg;
|
||||
|
||||
# the rest..
|
||||
$$h =~ s/(<[a-zA-Z][a-zA-Z0-9_-]*)(\s+.*?>)/_lowercasetag($1,$2)/sge;
|
||||
}
|
||||
}
|
||||
|
||||
sub _lowercasetag {
|
||||
my($prefix, $body) = @_;
|
||||
$prefix =~ s/^(.+)$/\L$1\E/;
|
||||
$body =~ s/(\s+[a-zA-Z][a-zA-Z0-9_-]*)(\s*=\s*[^"\s]+|\s*=\s*"[^"]*"|>|\s)/\L$1\E$2/sg;
|
||||
return $prefix.$body;
|
||||
}
|
||||
|
||||
# set options based on the level provided.. INTERNAL
|
||||
|
||||
sub _level_defaults($) {
|
||||
my ($level) = @_;
|
||||
|
||||
$do_whitespace = 1; # always do this...
|
||||
|
||||
# level 2
|
||||
$do_shortertags = ($level > 1) ? 1 : 0;
|
||||
$do_meta = ($level > 1) ? "generator formatter" : "";
|
||||
$do_contenttype = ($level > 1) ? 1 : 0;
|
||||
|
||||
# level 3
|
||||
$do_entities = ($level > 2) ? 1 : 0;
|
||||
$do_blink = ($level > 2) ? 1 : 0;
|
||||
|
||||
# level 4
|
||||
$do_comments = ($level > 3) ? 1 : 0;
|
||||
$do_dequote = ($level > 3) ? 1 : 0;
|
||||
$do_defcolor = ($level > 3) ? 1 : 0;
|
||||
$do_emptytags = ($level > 3) ? 'b i font center' : 0;
|
||||
$do_javascript = ($level > 3) ? 1 : 0;
|
||||
$do_htmldefaults = ($level > 3) ? 1 : 0;
|
||||
$do_lowercasetags = ($level > 3) ? 1 : 0;
|
||||
|
||||
# higher levels reserved for more intensive optimizations.
|
||||
}
|
||||
|
||||
######################################################################
|
||||
|
||||
=head2 compat()
|
||||
|
||||
This function improves the cross-platform compatibility of your HTML.
|
||||
Currently checks for the following problems:
|
||||
|
||||
=over 8
|
||||
|
||||
=item Insuring all IMG tags have ALT elements.
|
||||
|
||||
=item Use of Arial, Futura, or Verdana as a font face.
|
||||
|
||||
=item Positioning the <TITLE> tag immediately after the <head> tag.
|
||||
|
||||
=back
|
||||
|
||||
=cut
|
||||
|
||||
sub compat {
|
||||
my($self, $level, $options) = @_;
|
||||
|
||||
my $h = $self->{'DATA'};
|
||||
|
||||
$$h =~ s/face="arial"/face="arial,helvetica,sansserif"/sgi;
|
||||
$$h =~ s/face="(verdana|futura)"/face="$1,arial,helvetica,sansserif"/sgi;
|
||||
|
||||
# insure that <title> tag is directly after the <head> tag
|
||||
# Some search engines only search the first N chars. (PLweb for instance..)
|
||||
|
||||
if ($$h =~ s,<title>(.*)</title>,,si) {
|
||||
my $title = $1;
|
||||
$$h =~ s,<head>,<head><title>$title</title>,si;
|
||||
}
|
||||
|
||||
# Look for IMG without ALT tags.
|
||||
$$h =~ s/(<img[^>]+>)/_imgalt($1)/segi;
|
||||
}
|
||||
|
||||
sub _imgalt {
|
||||
my($tag) = @_;
|
||||
|
||||
$tag =~ s/>/ alt="">/ if ($tag !~ /alt=/i);
|
||||
return($tag);
|
||||
}
|
||||
|
||||
=head2 defrontpage();
|
||||
|
||||
This function converts pages created with Microsoft Frontpage to
|
||||
something a Unix server will understand a bit better. This function
|
||||
currently does the following:
|
||||
|
||||
=over 8
|
||||
|
||||
=item Converts Frontpage 'hit counters' into a unix specific format.
|
||||
|
||||
=item Removes some frontpage specific html comments
|
||||
|
||||
=back
|
||||
|
||||
=cut
|
||||
|
||||
|
||||
sub defrontpage {
|
||||
my($self) = @_;
|
||||
|
||||
my $h = $self->{'DATA'};
|
||||
|
||||
while ($$h =~ s,<img\sSRC="[\./]*_vti_bin/fpcount.exe(/.*/).Page=(.*?)\|.*?\s(.*?)>,<img src="/counter?link=$1$2" $3>,xis) {
|
||||
print "Converted a Hitcounter.. $1, $2, $3\n";
|
||||
}
|
||||
$$h =~ s,<!--(mstheme|msthemeseparator|msnavigation)-->,,sgx;
|
||||
}
|
||||
=back
|
||||
|
||||
=head1 SEE ALSO
|
||||
|
||||
=head2 Modules
|
||||
|
||||
FrontPage::Web, FrontPage::File
|
||||
|
||||
=head2 Web Sites
|
||||
|
||||
=over 6
|
||||
|
||||
=item Distribution Site - http://people.itu.int/~lindner/
|
||||
|
||||
=back
|
||||
|
||||
=head1 AUTHORS
|
||||
|
||||
Paul Lindner for the International Telecommunication Union (ITU)
|
||||
|
||||
=head1 COPYRIGHT
|
||||
|
||||
The HTML::Strip module is Copyright (c) 1998,99 by the ITU, Geneva Switzerland.
|
||||
All rights reserved.
|
||||
|
||||
You may distribute under the terms of either the GNU General Public
|
||||
License or the Artistic License, as specified in the Perl README file.
|
||||
|
||||
=cut
|
||||
|
||||
1;
|
||||
__END__
|
||||
500
database/perl/lib/HTML/Entities.pm
Normal file
500
database/perl/lib/HTML/Entities.pm
Normal file
@@ -0,0 +1,500 @@
|
||||
package HTML::Entities;
|
||||
|
||||
=head1 NAME
|
||||
|
||||
HTML::Entities - Encode or decode strings with HTML entities
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
use HTML::Entities;
|
||||
|
||||
$a = "Våre norske tegn bør æres";
|
||||
decode_entities($a);
|
||||
encode_entities($a, "\200-\377");
|
||||
|
||||
For example, this:
|
||||
|
||||
$input = "vis-à-vis Beyoncé's naïve\npapier-mâché résumé";
|
||||
print encode_entities($input), "\n"
|
||||
|
||||
Prints this out:
|
||||
|
||||
vis-à-vis Beyoncé's naïve
|
||||
papier-mâché résumé
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
This module deals with encoding and decoding of strings with HTML
|
||||
character entities. The module provides the following functions:
|
||||
|
||||
=over 4
|
||||
|
||||
=item decode_entities( $string, ... )
|
||||
|
||||
This routine replaces HTML entities found in the $string with the
|
||||
corresponding Unicode character. Under perl 5.6 and earlier only
|
||||
characters in the Latin-1 range are replaced. Unrecognized
|
||||
entities are left alone.
|
||||
|
||||
If multiple strings are provided as argument they are each decoded
|
||||
separately and the same number of strings are returned.
|
||||
|
||||
If called in void context the arguments are decoded in-place.
|
||||
|
||||
This routine is exported by default.
|
||||
|
||||
=item _decode_entities( $string, \%entity2char )
|
||||
|
||||
=item _decode_entities( $string, \%entity2char, $expand_prefix )
|
||||
|
||||
This will in-place replace HTML entities in $string. The %entity2char
|
||||
hash must be provided. Named entities not found in the %entity2char
|
||||
hash are left alone. Numeric entities are expanded unless their value
|
||||
overflow.
|
||||
|
||||
The keys in %entity2char are the entity names to be expanded and their
|
||||
values are what they should expand into. The values do not have to be
|
||||
single character strings. If a key has ";" as suffix,
|
||||
then occurrences in $string are only expanded if properly terminated
|
||||
with ";". Entities without ";" will be expanded regardless of how
|
||||
they are terminated for compatibility with how common browsers treat
|
||||
entities in the Latin-1 range.
|
||||
|
||||
If $expand_prefix is TRUE then entities without trailing ";" in
|
||||
%entity2char will even be expanded as a prefix of a longer
|
||||
unrecognized name. The longest matching name in %entity2char will be
|
||||
used. This is mainly present for compatibility with an MSIE
|
||||
misfeature.
|
||||
|
||||
$string = "foo bar";
|
||||
_decode_entities($string, { nb => "@", nbsp => "\xA0" }, 1);
|
||||
print $string; # will print "foo bar"
|
||||
|
||||
This routine is exported by default.
|
||||
|
||||
=item encode_entities( $string )
|
||||
|
||||
=item encode_entities( $string, $unsafe_chars )
|
||||
|
||||
This routine replaces unsafe characters in $string with their entity
|
||||
representation. A second argument can be given to specify which characters to
|
||||
consider unsafe. The unsafe characters is specified using the regular
|
||||
expression character class syntax (what you find within brackets in regular
|
||||
expressions).
|
||||
|
||||
The default set of characters to encode are control chars, high-bit chars, and
|
||||
the C<< < >>, C<< & >>, C<< > >>, C<< ' >> and C<< " >> characters. But this,
|
||||
for example, would encode I<just> the C<< < >>, C<< & >>, C<< > >>, and C<< "
|
||||
>> characters:
|
||||
|
||||
$encoded = encode_entities($input, '<>&"');
|
||||
|
||||
and this would only encode non-plain ascii:
|
||||
|
||||
$encoded = encode_entities($input, '^\n\x20-\x25\x27-\x7e');
|
||||
|
||||
This routine is exported by default.
|
||||
|
||||
=item encode_entities_numeric( $string )
|
||||
|
||||
=item encode_entities_numeric( $string, $unsafe_chars )
|
||||
|
||||
This routine works just like encode_entities, except that the replacement
|
||||
entities are always C<&#xI<hexnum>;> and never C<&I<entname>;>. For
|
||||
example, C<encode_entities("r\xF4le")> returns "rôle", but
|
||||
C<encode_entities_numeric("r\xF4le")> returns "rôle".
|
||||
|
||||
This routine is I<not> exported by default. But you can always
|
||||
export it with C<use HTML::Entities qw(encode_entities_numeric);>
|
||||
or even C<use HTML::Entities qw(:DEFAULT encode_entities_numeric);>
|
||||
|
||||
=back
|
||||
|
||||
All these routines modify the string passed as the first argument, if
|
||||
called in a void context. In scalar and array contexts, the encoded or
|
||||
decoded string is returned (without changing the input string).
|
||||
|
||||
If you prefer not to import these routines into your namespace, you can
|
||||
call them as:
|
||||
|
||||
use HTML::Entities ();
|
||||
$decoded = HTML::Entities::decode($a);
|
||||
$encoded = HTML::Entities::encode($a);
|
||||
$encoded = HTML::Entities::encode_numeric($a);
|
||||
|
||||
The module can also export the %char2entity and the %entity2char
|
||||
hashes, which contain the mapping from all characters to the
|
||||
corresponding entities (and vice versa, respectively).
|
||||
|
||||
=head1 COPYRIGHT
|
||||
|
||||
Copyright 1995-2006 Gisle Aas. All rights reserved.
|
||||
|
||||
This library is free software; you can redistribute it and/or
|
||||
modify it under the same terms as Perl itself.
|
||||
|
||||
=cut
|
||||
|
||||
use strict;
|
||||
use vars qw(@ISA @EXPORT @EXPORT_OK $VERSION);
|
||||
use vars qw(%entity2char %char2entity);
|
||||
|
||||
require 5.004;
|
||||
require Exporter;
|
||||
@ISA = qw(Exporter);
|
||||
|
||||
@EXPORT = qw(encode_entities decode_entities _decode_entities);
|
||||
@EXPORT_OK = qw(%entity2char %char2entity encode_entities_numeric);
|
||||
|
||||
$VERSION = "3.64";
|
||||
sub Version { $VERSION; }
|
||||
|
||||
require HTML::Parser; # for fast XS implemented decode_entities
|
||||
|
||||
|
||||
%entity2char = (
|
||||
# Some normal chars that have special meaning in SGML context
|
||||
amp => '&', # ampersand
|
||||
'gt' => '>', # greater than
|
||||
'lt' => '<', # less than
|
||||
quot => '"', # double quote
|
||||
apos => "'", # single quote
|
||||
|
||||
# PUBLIC ISO 8879-1986//ENTITIES Added Latin 1//EN//HTML
|
||||
AElig => chr(198), # capital AE diphthong (ligature)
|
||||
Aacute => chr(193), # capital A, acute accent
|
||||
Acirc => chr(194), # capital A, circumflex accent
|
||||
Agrave => chr(192), # capital A, grave accent
|
||||
Aring => chr(197), # capital A, ring
|
||||
Atilde => chr(195), # capital A, tilde
|
||||
Auml => chr(196), # capital A, dieresis or umlaut mark
|
||||
Ccedil => chr(199), # capital C, cedilla
|
||||
ETH => chr(208), # capital Eth, Icelandic
|
||||
Eacute => chr(201), # capital E, acute accent
|
||||
Ecirc => chr(202), # capital E, circumflex accent
|
||||
Egrave => chr(200), # capital E, grave accent
|
||||
Euml => chr(203), # capital E, dieresis or umlaut mark
|
||||
Iacute => chr(205), # capital I, acute accent
|
||||
Icirc => chr(206), # capital I, circumflex accent
|
||||
Igrave => chr(204), # capital I, grave accent
|
||||
Iuml => chr(207), # capital I, dieresis or umlaut mark
|
||||
Ntilde => chr(209), # capital N, tilde
|
||||
Oacute => chr(211), # capital O, acute accent
|
||||
Ocirc => chr(212), # capital O, circumflex accent
|
||||
Ograve => chr(210), # capital O, grave accent
|
||||
Oslash => chr(216), # capital O, slash
|
||||
Otilde => chr(213), # capital O, tilde
|
||||
Ouml => chr(214), # capital O, dieresis or umlaut mark
|
||||
THORN => chr(222), # capital THORN, Icelandic
|
||||
Uacute => chr(218), # capital U, acute accent
|
||||
Ucirc => chr(219), # capital U, circumflex accent
|
||||
Ugrave => chr(217), # capital U, grave accent
|
||||
Uuml => chr(220), # capital U, dieresis or umlaut mark
|
||||
Yacute => chr(221), # capital Y, acute accent
|
||||
aacute => chr(225), # small a, acute accent
|
||||
acirc => chr(226), # small a, circumflex accent
|
||||
aelig => chr(230), # small ae diphthong (ligature)
|
||||
agrave => chr(224), # small a, grave accent
|
||||
aring => chr(229), # small a, ring
|
||||
atilde => chr(227), # small a, tilde
|
||||
auml => chr(228), # small a, dieresis or umlaut mark
|
||||
ccedil => chr(231), # small c, cedilla
|
||||
eacute => chr(233), # small e, acute accent
|
||||
ecirc => chr(234), # small e, circumflex accent
|
||||
egrave => chr(232), # small e, grave accent
|
||||
eth => chr(240), # small eth, Icelandic
|
||||
euml => chr(235), # small e, dieresis or umlaut mark
|
||||
iacute => chr(237), # small i, acute accent
|
||||
icirc => chr(238), # small i, circumflex accent
|
||||
igrave => chr(236), # small i, grave accent
|
||||
iuml => chr(239), # small i, dieresis or umlaut mark
|
||||
ntilde => chr(241), # small n, tilde
|
||||
oacute => chr(243), # small o, acute accent
|
||||
ocirc => chr(244), # small o, circumflex accent
|
||||
ograve => chr(242), # small o, grave accent
|
||||
oslash => chr(248), # small o, slash
|
||||
otilde => chr(245), # small o, tilde
|
||||
ouml => chr(246), # small o, dieresis or umlaut mark
|
||||
szlig => chr(223), # small sharp s, German (sz ligature)
|
||||
thorn => chr(254), # small thorn, Icelandic
|
||||
uacute => chr(250), # small u, acute accent
|
||||
ucirc => chr(251), # small u, circumflex accent
|
||||
ugrave => chr(249), # small u, grave accent
|
||||
uuml => chr(252), # small u, dieresis or umlaut mark
|
||||
yacute => chr(253), # small y, acute accent
|
||||
yuml => chr(255), # small y, dieresis or umlaut mark
|
||||
|
||||
# Some extra Latin 1 chars that are listed in the HTML3.2 draft (21-May-96)
|
||||
copy => chr(169), # copyright sign
|
||||
reg => chr(174), # registered sign
|
||||
nbsp => chr(160), # non breaking space
|
||||
|
||||
# Additional ISO-8859/1 entities listed in rfc1866 (section 14)
|
||||
iexcl => chr(161),
|
||||
cent => chr(162),
|
||||
pound => chr(163),
|
||||
curren => chr(164),
|
||||
yen => chr(165),
|
||||
brvbar => chr(166),
|
||||
sect => chr(167),
|
||||
uml => chr(168),
|
||||
ordf => chr(170),
|
||||
laquo => chr(171),
|
||||
'not' => chr(172), # not is a keyword in perl
|
||||
shy => chr(173),
|
||||
macr => chr(175),
|
||||
deg => chr(176),
|
||||
plusmn => chr(177),
|
||||
sup1 => chr(185),
|
||||
sup2 => chr(178),
|
||||
sup3 => chr(179),
|
||||
acute => chr(180),
|
||||
micro => chr(181),
|
||||
para => chr(182),
|
||||
middot => chr(183),
|
||||
cedil => chr(184),
|
||||
ordm => chr(186),
|
||||
raquo => chr(187),
|
||||
frac14 => chr(188),
|
||||
frac12 => chr(189),
|
||||
frac34 => chr(190),
|
||||
iquest => chr(191),
|
||||
'times' => chr(215), # times is a keyword in perl
|
||||
divide => chr(247),
|
||||
|
||||
( $] > 5.007 ? (
|
||||
'OElig;' => chr(338),
|
||||
'oelig;' => chr(339),
|
||||
'Scaron;' => chr(352),
|
||||
'scaron;' => chr(353),
|
||||
'Yuml;' => chr(376),
|
||||
'fnof;' => chr(402),
|
||||
'circ;' => chr(710),
|
||||
'tilde;' => chr(732),
|
||||
'Alpha;' => chr(913),
|
||||
'Beta;' => chr(914),
|
||||
'Gamma;' => chr(915),
|
||||
'Delta;' => chr(916),
|
||||
'Epsilon;' => chr(917),
|
||||
'Zeta;' => chr(918),
|
||||
'Eta;' => chr(919),
|
||||
'Theta;' => chr(920),
|
||||
'Iota;' => chr(921),
|
||||
'Kappa;' => chr(922),
|
||||
'Lambda;' => chr(923),
|
||||
'Mu;' => chr(924),
|
||||
'Nu;' => chr(925),
|
||||
'Xi;' => chr(926),
|
||||
'Omicron;' => chr(927),
|
||||
'Pi;' => chr(928),
|
||||
'Rho;' => chr(929),
|
||||
'Sigma;' => chr(931),
|
||||
'Tau;' => chr(932),
|
||||
'Upsilon;' => chr(933),
|
||||
'Phi;' => chr(934),
|
||||
'Chi;' => chr(935),
|
||||
'Psi;' => chr(936),
|
||||
'Omega;' => chr(937),
|
||||
'alpha;' => chr(945),
|
||||
'beta;' => chr(946),
|
||||
'gamma;' => chr(947),
|
||||
'delta;' => chr(948),
|
||||
'epsilon;' => chr(949),
|
||||
'zeta;' => chr(950),
|
||||
'eta;' => chr(951),
|
||||
'theta;' => chr(952),
|
||||
'iota;' => chr(953),
|
||||
'kappa;' => chr(954),
|
||||
'lambda;' => chr(955),
|
||||
'mu;' => chr(956),
|
||||
'nu;' => chr(957),
|
||||
'xi;' => chr(958),
|
||||
'omicron;' => chr(959),
|
||||
'pi;' => chr(960),
|
||||
'rho;' => chr(961),
|
||||
'sigmaf;' => chr(962),
|
||||
'sigma;' => chr(963),
|
||||
'tau;' => chr(964),
|
||||
'upsilon;' => chr(965),
|
||||
'phi;' => chr(966),
|
||||
'chi;' => chr(967),
|
||||
'psi;' => chr(968),
|
||||
'omega;' => chr(969),
|
||||
'thetasym;' => chr(977),
|
||||
'upsih;' => chr(978),
|
||||
'piv;' => chr(982),
|
||||
'ensp;' => chr(8194),
|
||||
'emsp;' => chr(8195),
|
||||
'thinsp;' => chr(8201),
|
||||
'zwnj;' => chr(8204),
|
||||
'zwj;' => chr(8205),
|
||||
'lrm;' => chr(8206),
|
||||
'rlm;' => chr(8207),
|
||||
'ndash;' => chr(8211),
|
||||
'mdash;' => chr(8212),
|
||||
'lsquo;' => chr(8216),
|
||||
'rsquo;' => chr(8217),
|
||||
'sbquo;' => chr(8218),
|
||||
'ldquo;' => chr(8220),
|
||||
'rdquo;' => chr(8221),
|
||||
'bdquo;' => chr(8222),
|
||||
'dagger;' => chr(8224),
|
||||
'Dagger;' => chr(8225),
|
||||
'bull;' => chr(8226),
|
||||
'hellip;' => chr(8230),
|
||||
'permil;' => chr(8240),
|
||||
'prime;' => chr(8242),
|
||||
'Prime;' => chr(8243),
|
||||
'lsaquo;' => chr(8249),
|
||||
'rsaquo;' => chr(8250),
|
||||
'oline;' => chr(8254),
|
||||
'frasl;' => chr(8260),
|
||||
'euro;' => chr(8364),
|
||||
'image;' => chr(8465),
|
||||
'weierp;' => chr(8472),
|
||||
'real;' => chr(8476),
|
||||
'trade;' => chr(8482),
|
||||
'alefsym;' => chr(8501),
|
||||
'larr;' => chr(8592),
|
||||
'uarr;' => chr(8593),
|
||||
'rarr;' => chr(8594),
|
||||
'darr;' => chr(8595),
|
||||
'harr;' => chr(8596),
|
||||
'crarr;' => chr(8629),
|
||||
'lArr;' => chr(8656),
|
||||
'uArr;' => chr(8657),
|
||||
'rArr;' => chr(8658),
|
||||
'dArr;' => chr(8659),
|
||||
'hArr;' => chr(8660),
|
||||
'forall;' => chr(8704),
|
||||
'part;' => chr(8706),
|
||||
'exist;' => chr(8707),
|
||||
'empty;' => chr(8709),
|
||||
'nabla;' => chr(8711),
|
||||
'isin;' => chr(8712),
|
||||
'notin;' => chr(8713),
|
||||
'ni;' => chr(8715),
|
||||
'prod;' => chr(8719),
|
||||
'sum;' => chr(8721),
|
||||
'minus;' => chr(8722),
|
||||
'lowast;' => chr(8727),
|
||||
'radic;' => chr(8730),
|
||||
'prop;' => chr(8733),
|
||||
'infin;' => chr(8734),
|
||||
'ang;' => chr(8736),
|
||||
'and;' => chr(8743),
|
||||
'or;' => chr(8744),
|
||||
'cap;' => chr(8745),
|
||||
'cup;' => chr(8746),
|
||||
'int;' => chr(8747),
|
||||
'there4;' => chr(8756),
|
||||
'sim;' => chr(8764),
|
||||
'cong;' => chr(8773),
|
||||
'asymp;' => chr(8776),
|
||||
'ne;' => chr(8800),
|
||||
'equiv;' => chr(8801),
|
||||
'le;' => chr(8804),
|
||||
'ge;' => chr(8805),
|
||||
'sub;' => chr(8834),
|
||||
'sup;' => chr(8835),
|
||||
'nsub;' => chr(8836),
|
||||
'sube;' => chr(8838),
|
||||
'supe;' => chr(8839),
|
||||
'oplus;' => chr(8853),
|
||||
'otimes;' => chr(8855),
|
||||
'perp;' => chr(8869),
|
||||
'sdot;' => chr(8901),
|
||||
'lceil;' => chr(8968),
|
||||
'rceil;' => chr(8969),
|
||||
'lfloor;' => chr(8970),
|
||||
'rfloor;' => chr(8971),
|
||||
'lang;' => chr(9001),
|
||||
'rang;' => chr(9002),
|
||||
'loz;' => chr(9674),
|
||||
'spades;' => chr(9824),
|
||||
'clubs;' => chr(9827),
|
||||
'hearts;' => chr(9829),
|
||||
'diams;' => chr(9830),
|
||||
) : ())
|
||||
);
|
||||
|
||||
|
||||
# Make the opposite mapping
|
||||
while (my($entity, $char) = each(%entity2char)) {
|
||||
$entity =~ s/;\z//;
|
||||
$char2entity{$char} = "&$entity;";
|
||||
}
|
||||
delete $char2entity{"'"}; # only one-way decoding
|
||||
|
||||
# Fill in missing entities
|
||||
for (0 .. 255) {
|
||||
next if exists $char2entity{chr($_)};
|
||||
$char2entity{chr($_)} = "&#$_;";
|
||||
}
|
||||
|
||||
my %subst; # compiled encoding regexps
|
||||
|
||||
sub decode_entities_old
|
||||
{
|
||||
my $array;
|
||||
if (defined wantarray) {
|
||||
$array = [@_]; # copy
|
||||
} else {
|
||||
$array = \@_; # modify in-place
|
||||
}
|
||||
my $c;
|
||||
for (@$array) {
|
||||
s/(&\#(\d+);?)/$2 < 256 ? chr($2) : $1/eg;
|
||||
s/(&\#[xX]([0-9a-fA-F]+);?)/$c = hex($2); $c < 256 ? chr($c) : $1/eg;
|
||||
s/(&(\w+);?)/$entity2char{$2} || $1/eg;
|
||||
}
|
||||
wantarray ? @$array : $array->[0];
|
||||
}
|
||||
|
||||
sub encode_entities
|
||||
{
|
||||
return undef unless defined $_[0];
|
||||
my $ref;
|
||||
if (defined wantarray) {
|
||||
my $x = $_[0];
|
||||
$ref = \$x; # copy
|
||||
} else {
|
||||
$ref = \$_[0]; # modify in-place
|
||||
}
|
||||
if (defined $_[1] and length $_[1]) {
|
||||
unless (exists $subst{$_[1]}) {
|
||||
# Because we can't compile regex we fake it with a cached sub
|
||||
my $chars = $_[1];
|
||||
$chars =~ s,(?<!\\)([]/]),\\$1,g;
|
||||
$chars =~ s,(?<!\\)\\\z,\\\\,;
|
||||
my $code = "sub {\$_[0] =~ s/([$chars])/\$char2entity{\$1} || num_entity(\$1)/ge; }";
|
||||
$subst{$_[1]} = eval $code;
|
||||
die( $@ . " while trying to turn range: \"$_[1]\"\n "
|
||||
. "into code: $code\n "
|
||||
) if $@;
|
||||
}
|
||||
&{$subst{$_[1]}}($$ref);
|
||||
} else {
|
||||
# Encode control chars, high bit chars and '<', '&', '>', ''' and '"'
|
||||
$$ref =~ s/([^\n\r\t !\#\$%\(-;=?-~])/$char2entity{$1} || num_entity($1)/ge;
|
||||
}
|
||||
$$ref;
|
||||
}
|
||||
|
||||
sub encode_entities_numeric {
|
||||
local %char2entity;
|
||||
return &encode_entities; # a goto &encode_entities wouldn't work
|
||||
}
|
||||
|
||||
|
||||
sub num_entity {
|
||||
sprintf "&#x%X;", ord($_[0]);
|
||||
}
|
||||
|
||||
# Set up aliases
|
||||
*encode = \&encode_entities;
|
||||
*encode_numeric = \&encode_entities_numeric;
|
||||
*encode_numerically = \&encode_entities_numeric;
|
||||
*decode = \&decode_entities;
|
||||
|
||||
1;
|
||||
684
database/perl/lib/HTML/FillInForm.pm
Normal file
684
database/perl/lib/HTML/FillInForm.pm
Normal file
@@ -0,0 +1,684 @@
|
||||
package HTML::FillInForm;
|
||||
|
||||
use integer; # no floating point math so far!
|
||||
use strict; # and no funny business, either.
|
||||
|
||||
use Carp; # generate better errors with more context
|
||||
|
||||
# required for attr_encoded
|
||||
use HTML::Parser 3.26;
|
||||
|
||||
# required for UNIVERSAL->can
|
||||
require 5.005;
|
||||
|
||||
use vars qw($VERSION @ISA);
|
||||
$VERSION = '2.00';
|
||||
|
||||
@ISA = qw(HTML::Parser);
|
||||
|
||||
sub new {
|
||||
my ($class) = @_;
|
||||
my $self = bless {}, $class;
|
||||
$self->init;
|
||||
# tell HTML::Parser not to decode attributes
|
||||
$self->attr_encoded(1);
|
||||
return $self;
|
||||
}
|
||||
|
||||
# a few shortcuts to fill()
|
||||
sub fill_file { my $self = shift; return $self->fill('file' ,@_); }
|
||||
sub fill_arrayref { my $self = shift; return $self->fill('arrayref' ,@_); }
|
||||
sub fill_scalarref { my $self = shift; return $self->fill('scalarref',@_); }
|
||||
|
||||
# track the keys we support. Useful for file-name detection.
|
||||
sub _known_keys {
|
||||
return {
|
||||
scalarref => 1,
|
||||
arrayref => 1,
|
||||
fdat => 1,
|
||||
fobject => 1,
|
||||
file => 1,
|
||||
target => 1,
|
||||
fill_password => 1,
|
||||
ignore_fields => 1,
|
||||
disable_fields => 1,
|
||||
}
|
||||
}
|
||||
|
||||
sub fill {
|
||||
my $self = shift;
|
||||
|
||||
# If we are called as a class method, go ahead and call new().
|
||||
$self = $self->new if (not ref $self);
|
||||
|
||||
my %option;
|
||||
|
||||
# If the first arg is a scalarref, translate that to scalarref => $first_arg
|
||||
if (ref $_[0] eq 'SCALAR') {
|
||||
$option{scalarref} = shift;
|
||||
}
|
||||
elsif (ref $_[0] eq 'ARRAY') {
|
||||
$option{arrayref} = shift;
|
||||
}
|
||||
elsif (ref $_[0] eq 'GLOB') {
|
||||
$option{file} = shift;
|
||||
}
|
||||
elsif (ref $_[0]) {
|
||||
croak "data source is not a reference type we understand";
|
||||
}
|
||||
# Last chance, if the first arg isn't one of the known keys, we
|
||||
# assume it is a file name.
|
||||
elsif (not _known_keys()->{$_[0]} ) {
|
||||
$option{file} = shift;
|
||||
}
|
||||
else {
|
||||
# Should be a known key. Nothing to do.
|
||||
}
|
||||
|
||||
|
||||
# Now, check to see if the next arg is also a reference.
|
||||
my $data;
|
||||
if (ref $_[0]) {
|
||||
$data = shift;
|
||||
$data = [$data] unless ref $data eq 'ARRAY';
|
||||
|
||||
for my $source (@$data) {
|
||||
if (ref $source eq 'HASH') {
|
||||
push @{ $option{fdat} }, $source;
|
||||
}
|
||||
elsif (ref $source) {
|
||||
if ($source->can('param')) {
|
||||
push @{ $option{fobject} }, $source;
|
||||
}
|
||||
else {
|
||||
croak "data source $source does not supply a param method";
|
||||
}
|
||||
}
|
||||
elsif (defined $source) {
|
||||
croak "data source $source is not a hash or object reference";
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
# load in the rest of the options
|
||||
%option = (%option, @_);
|
||||
|
||||
|
||||
# As suggested in the docs, merge multiple fdats into one.
|
||||
if (ref $option{fdat} eq 'ARRAY') {
|
||||
my %merged;
|
||||
for my $hash (@{ $option{fdat} }) {
|
||||
for my $key (keys %$hash) {
|
||||
$merged{$key} = $hash->{$key};
|
||||
}
|
||||
}
|
||||
$option{'fdat'} = \%merged;
|
||||
}
|
||||
|
||||
|
||||
my %ignore_fields;
|
||||
%ignore_fields = map { $_ => 1 } ( ref $option{'ignore_fields'} eq 'ARRAY' )
|
||||
? @{ $option{ignore_fields} } : $option{ignore_fields} if exists( $option{ignore_fields} );
|
||||
$self->{ignore_fields} = \%ignore_fields;
|
||||
|
||||
my %disable_fields;
|
||||
%disable_fields = map { $_ => 1 } ( ref $option{'disable_fields'} eq 'ARRAY' )
|
||||
? @{ $option{disable_fields} } : $option{disable_fields} if exists( $option{disable_fields} );
|
||||
$self->{disable_fields} = \%disable_fields;
|
||||
|
||||
if (my $fdat = $option{fdat}){
|
||||
# Copy the structure to prevent side-effects.
|
||||
my %copy;
|
||||
keys %$fdat; # reset fdat if each or Dumper was called on fdat
|
||||
while(my($key, $val) = each %$fdat) {
|
||||
next if exists $ignore_fields{$key};
|
||||
$copy{ $key } = ref $val eq 'ARRAY' ? [ @$val ] : $val;
|
||||
}
|
||||
$self->{fdat} = \%copy;
|
||||
}
|
||||
|
||||
# We want the reference to these objects to go out of scope at the
|
||||
# end of the method.
|
||||
local $self->{objects} = [];
|
||||
if(my $objects = $option{fobject}){
|
||||
unless(ref($objects) eq 'ARRAY'){
|
||||
$objects = [ $objects ];
|
||||
}
|
||||
for my $object (@$objects){
|
||||
# make sure objects in 'param_object' parameter support param()
|
||||
defined($object->can('param')) or
|
||||
croak("HTML::FillInForm->fill called with fobject option, containing object of type " . ref($object) . " which lacks a param() method!");
|
||||
}
|
||||
|
||||
$self->{objects} = $objects;
|
||||
}
|
||||
if (my $target = $option{target}){
|
||||
$self->{'target'} = $target;
|
||||
}
|
||||
|
||||
if (defined($option{fill_password})){
|
||||
$self->{fill_password} = $option{fill_password};
|
||||
} else {
|
||||
$self->{fill_password} = 1;
|
||||
}
|
||||
|
||||
# make sure method has data to fill in HTML form with!
|
||||
unless(exists $self->{fdat} || $self->{objects}){
|
||||
croak("HTML::FillInForm->fillInForm() called without 'fobject' or 'fdat' parameter set");
|
||||
}
|
||||
|
||||
local $self->{object_param_cache};
|
||||
|
||||
if(my $file = $option{file}){
|
||||
$self->parse_file($file);
|
||||
} elsif (my $scalarref = $option{scalarref}){
|
||||
$self->parse($$scalarref);
|
||||
} elsif (my $arrayref = $option{arrayref}){
|
||||
for (@$arrayref){
|
||||
$self->parse($_);
|
||||
}
|
||||
}
|
||||
|
||||
$self->eof;
|
||||
return delete $self->{output};
|
||||
}
|
||||
|
||||
# handles opening HTML tags such as <input ...>
|
||||
sub start {
|
||||
my ($self, $tagname, $attr, $attrseq, $origtext) = @_;
|
||||
|
||||
# set the current form
|
||||
if ($tagname eq 'form') {
|
||||
$self->{object_param_cache} = {};
|
||||
if (exists $attr->{'name'} || exists $attr->{'id'}) {
|
||||
$self->{'current_form'} = $attr->{'name'} || $attr->{'id'};
|
||||
} else {
|
||||
# in case of previous one without </FORM>
|
||||
delete $self->{'current_form'};
|
||||
}
|
||||
}
|
||||
|
||||
# This form is not my target.
|
||||
if (exists $self->{'target'} &&
|
||||
(! exists $self->{'current_form'} ||
|
||||
$self->{'current_form'} ne $self->{'target'})) {
|
||||
$self->{'output'} .= $origtext;
|
||||
return;
|
||||
}
|
||||
|
||||
# HTML::Parser converts tagname to lowercase, so we don't need /i
|
||||
if ($self->{option_no_value}) {
|
||||
$self->{output} .= '>';
|
||||
delete $self->{option_no_value};
|
||||
}
|
||||
|
||||
# Check if we need to disable this field
|
||||
$attr->{disable} = 1
|
||||
if exists $attr->{'name'} and
|
||||
exists $self->{disable_fields}{ $attr->{'name'} } and
|
||||
$self->{disable_fields}{ $attr->{'name'} } and
|
||||
not ( exists $attr->{disable} and $attr->{disable} );
|
||||
if ($tagname eq 'input'){
|
||||
my $value = exists $attr->{'name'} ? $self->_get_param($attr->{'name'}) : undef;
|
||||
# force hidden fields to have a value
|
||||
$value = '' if exists($attr->{'type'}) && $attr->{'type'} eq 'hidden' && ! exists $attr->{'value'} && ! defined $value;
|
||||
if (defined($value)){
|
||||
$value = $self->escapeHTMLStringOrList($value);
|
||||
# check for input type, noting that default type is text
|
||||
if (!exists $attr->{'type'} ||
|
||||
$attr->{'type'} =~ /^(text|textfield|hidden|)$/i){
|
||||
if ( ref($value) eq 'ARRAY' ) {
|
||||
$value = shift @$value;
|
||||
$value = '' unless defined $value;
|
||||
}
|
||||
$attr->{'value'} = $value;
|
||||
} elsif (lc $attr->{'type'} eq 'password' && $self->{fill_password}) {
|
||||
if ( ref($value) eq 'ARRAY' ) {
|
||||
$value = shift @$value;
|
||||
$value = '' unless defined $value;
|
||||
}
|
||||
$attr->{'value'} = $value;
|
||||
} elsif (lc $attr->{'type'} eq 'radio'){
|
||||
if ( ref($value) eq 'ARRAY' ) {
|
||||
$value = $value->[0];
|
||||
$value = '' unless defined $value;
|
||||
}
|
||||
# value for radio boxes default to 'on', works with netscape
|
||||
$attr->{'value'} = 'on' unless exists $attr->{'value'};
|
||||
if ($attr->{'value'} eq $value){
|
||||
$attr->{'checked'} = 'checked';
|
||||
} else {
|
||||
delete $attr->{'checked'};
|
||||
}
|
||||
} elsif (lc $attr->{'type'} eq 'checkbox'){
|
||||
# value for checkboxes default to 'on', works with netscape
|
||||
$attr->{'value'} = 'on' unless exists $attr->{'value'};
|
||||
|
||||
delete $attr->{'checked'}; # Everything is unchecked to start
|
||||
$value = [ $value ] unless ref($value) eq 'ARRAY';
|
||||
foreach my $v ( @$value ) {
|
||||
if ( $attr->{'value'} eq $v ) {
|
||||
$attr->{'checked'} = 'checked';
|
||||
}
|
||||
}
|
||||
# } else {
|
||||
# warn(qq(Input field of unknown type "$attr->{type}": $origtext));
|
||||
}
|
||||
}
|
||||
$self->{output} .= "<$tagname";
|
||||
while (my ($key, $value) = each %$attr) {
|
||||
next if $key eq '/';
|
||||
$self->{output} .= sprintf qq( %s="%s"), $key, $value;
|
||||
}
|
||||
# extra space put here to work around Opera 6.01/6.02 bug
|
||||
$self->{output} .= ' /' if $attr->{'/'};
|
||||
$self->{output} .= ">";
|
||||
} elsif ($tagname eq 'option'){
|
||||
my $value = $self->_get_param($self->{selectName});
|
||||
$value = [ $value ] unless ( ref($value) eq 'ARRAY' );
|
||||
|
||||
if ( defined $value->[0] ){
|
||||
$value = $self->escapeHTMLStringOrList($value);
|
||||
delete $attr->{selected} if exists $attr->{selected};
|
||||
|
||||
if(defined($attr->{'value'})){
|
||||
# option tag has value attr - <OPTION VALUE="foo">bar</OPTION>
|
||||
|
||||
if ($self->{selectMultiple}){
|
||||
# check if the option tag belongs to a multiple option select
|
||||
foreach my $v ( grep { defined } @$value ) {
|
||||
if ( $attr->{'value'} eq $v ){
|
||||
$attr->{selected} = 'selected';
|
||||
}
|
||||
}
|
||||
} else {
|
||||
# if not every value of a fdat ARRAY belongs to a different select tag
|
||||
if (not $self->{selectSelected}){
|
||||
if ( $attr->{'value'} eq $value->[0]){
|
||||
shift @$value if ref($value) eq 'ARRAY';
|
||||
$attr->{selected} = 'selected';
|
||||
$self->{selectSelected} = 1; # remeber that an option tag is selected for this select tag
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
# option tag has no value attr - <OPTION>bar</OPTION>
|
||||
# save for processing under text handler
|
||||
$self->{option_no_value} = $value;
|
||||
}
|
||||
}
|
||||
$self->{output} .= "<$tagname";
|
||||
while (my ($key, $value) = each %$attr) {
|
||||
$self->{output} .= sprintf qq( %s="%s"), $key, $value;
|
||||
}
|
||||
unless ($self->{option_no_value}){
|
||||
# we can close option tag here
|
||||
$self->{output} .= ">";
|
||||
}
|
||||
} elsif ($tagname eq 'textarea'){
|
||||
if ($attr->{'name'} and defined (my $value = $self->_get_param($attr->{'name'}))){
|
||||
$value = $self->escapeHTMLStringOrList($value);
|
||||
$value = (shift @$value || '') if ref($value) eq 'ARRAY';
|
||||
# <textarea> foobar </textarea> -> <textarea> $value </textarea>
|
||||
# we need to set outputText to 'no' so that 'foobar' won't be printed
|
||||
$self->{outputText} = 'no';
|
||||
$self->{output} .= $origtext . $value;
|
||||
} else {
|
||||
$self->{output} .= $origtext;
|
||||
}
|
||||
} elsif ($tagname eq 'select'){
|
||||
$self->{selectName} = $attr->{'name'};
|
||||
if (defined $attr->{'multiple'}){
|
||||
$self->{selectMultiple} = 1; # helper var to remember if the select tag has the multiple attr set or not
|
||||
} else {
|
||||
$self->{selectMultiple} = 0;
|
||||
$self->{selectSelected} = 0; # helper var to remember if an option was already selected in the current select tag
|
||||
}
|
||||
$self->{output} .= $origtext;
|
||||
} else {
|
||||
$self->{output} .= $origtext;
|
||||
}
|
||||
}
|
||||
|
||||
sub _get_param {
|
||||
my ($self, $param) = @_;
|
||||
|
||||
return undef if $self->{ignore_fields}{$param};
|
||||
|
||||
return $self->{fdat}{$param} if exists $self->{fdat}{$param};
|
||||
|
||||
return $self->{object_param_cache}{$param} if exists $self->{object_param_cache}{$param};
|
||||
|
||||
# traverse the list in reverse order for backwards compatibility
|
||||
# with the previous implementation.
|
||||
for my $o (reverse @{$self->{objects}}) {
|
||||
my @v = $o->param($param);
|
||||
|
||||
next unless @v;
|
||||
|
||||
return $self->{object_param_cache}{$param} = @v > 1 ? \@v : $v[0];
|
||||
}
|
||||
|
||||
return undef;
|
||||
}
|
||||
|
||||
# handles non-html text
|
||||
sub text {
|
||||
my ($self, $origtext) = @_;
|
||||
# just output text, unless replaced value of <textarea> tag
|
||||
unless(exists $self->{outputText} && $self->{outputText} eq 'no'){
|
||||
if(exists $self->{option_no_value}){
|
||||
# dealing with option tag with no value - <OPTION>bar</OPTION>
|
||||
my $values = $self->{option_no_value};
|
||||
my $value = $origtext;
|
||||
$value =~ s/^\s+//;
|
||||
$value =~ s/\s+$//;
|
||||
foreach my $v ( @$values ) {
|
||||
if ( $value eq $v ) {
|
||||
$self->{output} .= ' selected="selected"';
|
||||
}
|
||||
}
|
||||
# close <OPTION> tag
|
||||
$self->{output} .= ">$origtext";
|
||||
delete $self->{option_no_value};
|
||||
} else {
|
||||
$self->{output} .= $origtext;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# handles closing HTML tags such as </textarea>
|
||||
sub end {
|
||||
my ($self, $tagname, $origtext) = @_;
|
||||
if ($self->{option_no_value}) {
|
||||
$self->{output} .= '>';
|
||||
delete $self->{option_no_value};
|
||||
}
|
||||
if($tagname eq 'select'){
|
||||
delete $self->{selectName};
|
||||
} elsif ($tagname eq 'textarea'){
|
||||
delete $self->{outputText};
|
||||
} elsif ($tagname eq 'form') {
|
||||
delete $self->{'current_form'};
|
||||
}
|
||||
$self->{output} .= $origtext;
|
||||
}
|
||||
|
||||
sub escapeHTMLStringOrList {
|
||||
my ($self, $toencode) = @_;
|
||||
|
||||
if (ref($toencode) eq 'ARRAY') {
|
||||
foreach my $elem (@$toencode) {
|
||||
$elem = $self->escapeHTML($elem);
|
||||
}
|
||||
return $toencode;
|
||||
} else {
|
||||
return $self->escapeHTML($toencode);
|
||||
}
|
||||
}
|
||||
|
||||
sub escapeHTML {
|
||||
my ($self, $toencode) = @_;
|
||||
|
||||
return undef unless defined($toencode);
|
||||
$toencode =~ s/&/&/g;
|
||||
$toencode =~ s/\"/"/g;
|
||||
$toencode =~ s/>/>/g;
|
||||
$toencode =~ s/</</g;
|
||||
return $toencode;
|
||||
}
|
||||
|
||||
sub comment {
|
||||
my ( $self, $text ) = @_;
|
||||
# if it begins with '[if ' and doesn't end with '<![endif]'
|
||||
# it's a "downlevel-revealed" conditional comment (stupid IE)
|
||||
# or
|
||||
# if it ends with '[endif]' then it's the end of a
|
||||
# "downlevel-revealed" conditional comment
|
||||
if(
|
||||
(
|
||||
( index($text, '[if ') == 0 )
|
||||
&&
|
||||
( $text !~ /<!\[endif\]$/ )
|
||||
)
|
||||
||
|
||||
( $text eq '[endif]' )
|
||||
) {
|
||||
$self->{output} .= '<!' . $text . '>';
|
||||
} else {
|
||||
$self->{output} .= '<!--' . $text . '-->';
|
||||
}
|
||||
}
|
||||
|
||||
sub process {
|
||||
my ( $self, $token0, $text ) = @_;
|
||||
$self->{output} .= $text;
|
||||
}
|
||||
|
||||
sub declaration {
|
||||
my ( $self, $text ) = @_;
|
||||
$self->{output} .= '<!' . $text . '>';
|
||||
}
|
||||
|
||||
1;
|
||||
|
||||
__END__
|
||||
|
||||
=head1 NAME
|
||||
|
||||
HTML::FillInForm - Populates HTML Forms with data.
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
This module fills in an HTML form with data from a Perl data structure, allowing you
|
||||
to keep the HTML and Perl separate.
|
||||
|
||||
Here are two common use cases:
|
||||
|
||||
1. A user submits an HTML form without filling out a required field. You want
|
||||
to redisplay the form with all the previous data in it, to make it easy for the
|
||||
user to see and correct the error.
|
||||
|
||||
2. You have just retrieved a record from a database and need to display it in
|
||||
an HTML form.
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
Fill HTML form with data.
|
||||
|
||||
$output = HTML::FillInForm->fill( \$html, $q );
|
||||
$output = HTML::FillInForm->fill( \@html, [$q1,$q2] );
|
||||
$output = HTML::FillInForm->fill( \*HTML, \%data );
|
||||
$output = HTML::FillInForm->fill( 't.html', [\%data1,%data2] );
|
||||
|
||||
The HTML can be provided as a scalarref, arrayref, filehandle or file. The data can come from one or more
|
||||
hashrefs, or objects which support a param() method, like CGI.pm, L<Apache::Request|Apache::Request>, etc.
|
||||
|
||||
=head1 fill
|
||||
|
||||
The basic syntax is seen above the Synopsis. There are a few additional options.
|
||||
|
||||
=head2 Options
|
||||
|
||||
=head3 target => 'form1'
|
||||
|
||||
Suppose you have multiple forms in a html file and only want to fill in one.
|
||||
|
||||
$output = HTML::FillInForm->fill(\$html, $q, target => 'form1');
|
||||
|
||||
This will fill in only the form inside
|
||||
|
||||
<FORM name="form1"> ... </FORM>
|
||||
|
||||
=head3 fill_password => 0
|
||||
|
||||
Passwords are filled in by default. To disable:
|
||||
|
||||
fill_password => 0
|
||||
|
||||
=head3 ignore_fields => []
|
||||
|
||||
To disable the filling of some fields:
|
||||
|
||||
ignore_fields => ['prev','next']
|
||||
|
||||
=head3 disable_fields => []
|
||||
|
||||
To disable fields from being edited:
|
||||
|
||||
disable_fields => [ 'uid', 'gid' ]
|
||||
|
||||
=head2 File Upload fields
|
||||
|
||||
File upload fields cannot be supported directly. Workarounds include asking the
|
||||
user to re-attach any file uploads or fancy server-side storage and
|
||||
referencing. You are on your own.
|
||||
|
||||
=head2 Clearing Fields
|
||||
|
||||
Fields are cleared if you set their value to an empty string or empty arrayref but not undef:
|
||||
|
||||
# this will leave the form element foo untouched
|
||||
HTML::FillInForm->fill(\$html, { foo => undef });
|
||||
|
||||
# this will set clear the form element foo
|
||||
HTML::FillInForm->fill(\$html, { foo => "" });
|
||||
|
||||
It has been suggested to add a option to change the behavior so that undef
|
||||
values will clear the form elements. Patches welcome.
|
||||
|
||||
=head1 Old syntax
|
||||
|
||||
You probably need to read no further. The remaining docs concern the
|
||||
1.x era syntax, which is still supported.
|
||||
|
||||
=head2 new
|
||||
|
||||
Call C<new()> to create a new FillInForm object:
|
||||
|
||||
$fif = HTML::FillInForm->new;
|
||||
$fif->fill(...);
|
||||
|
||||
In theory, there is a slight performance benefit to calling C<new()> before C<fill()> if you make multiple
|
||||
calls to C<fill()> before you destroy the object. Benchmark before optimizing.
|
||||
|
||||
=head2 fill ( old syntax )
|
||||
|
||||
Instead of having your HTML and data types auto-detected, you can declare them explicitly in your
|
||||
call to C<fill()>:
|
||||
|
||||
HTML source options:
|
||||
|
||||
arrayref => @html
|
||||
scalarref => $html
|
||||
file => \*HTML
|
||||
file => 't.html'
|
||||
|
||||
Fill Data options:
|
||||
|
||||
fobject => $data_obj # with param() method
|
||||
fdat => \%data
|
||||
|
||||
Additional methods are also available:
|
||||
|
||||
fill_file(\*HTML,...);
|
||||
fill_file('t.html',...);
|
||||
fill_arrayref(\@html,...);
|
||||
fill_scalarref(\$html,...);
|
||||
|
||||
=head1 CALLING FROM OTHER MODULES
|
||||
|
||||
=head2 Apache::PageKit
|
||||
|
||||
To use HTML::FillInForm in L<Apache::PageKit> is easy. It is
|
||||
automatically called for any page that includes a <form> tag.
|
||||
It can be turned on or off by using the C<fill_in_form> configuration
|
||||
option.
|
||||
|
||||
=head2 Apache::ASP v2.09 and above
|
||||
|
||||
HTML::FillInForm is now integrated with Apache::ASP. To activate, use
|
||||
|
||||
PerlSetVar FormFill 1
|
||||
$Response->{FormFill} = 1
|
||||
|
||||
=head2 HTML::Mason
|
||||
|
||||
Using HTML::FillInForm from HTML::Mason is covered in the FAQ on
|
||||
the masonhq.com website at
|
||||
L<http://www.masonhq.com/?FAQ:HTTPAndHTML#h-how_can_i_populate_form_values_automatically_>
|
||||
|
||||
=head1 VERSION
|
||||
|
||||
This documentation describes HTML::FillInForm module version 2.00
|
||||
|
||||
=head1 SECURITY
|
||||
|
||||
Note that you might want to think about caching issues if you have password
|
||||
fields on your page. There is a discussion of this issue at
|
||||
|
||||
http://www.perlmonks.org/index.pl?node_id=70482
|
||||
|
||||
In summary, some browsers will cache the output of CGI scripts, and you
|
||||
can control this by setting the Expires header. For example, use
|
||||
C<-expires> in L<CGI.pm> or set C<browser_cache> to I<no> in
|
||||
Config.xml file of L<Apache::PageKit>.
|
||||
|
||||
=head1 TRANSLATION
|
||||
|
||||
Kato Atsushi has translated these docs into Japanese, available from
|
||||
|
||||
http://perldoc.jp
|
||||
|
||||
=head1 BUGS
|
||||
|
||||
Please submit any bug reports to tjmather@maxmind.com.
|
||||
|
||||
=head1 NOTES
|
||||
|
||||
Requires Perl 5.005 and L<HTML::Parser> version 3.26.
|
||||
|
||||
I wrote this module because I wanted to be able to insert CGI data
|
||||
into HTML forms,
|
||||
but without combining the HTML and Perl code. CGI.pm and Embperl allow you so
|
||||
insert CGI data into forms, but require that you mix HTML with Perl.
|
||||
|
||||
There is a nice review of the module available here:
|
||||
L<http://www.perlmonks.org/index.pl?node_id=274534>
|
||||
|
||||
=head1 AUTHOR
|
||||
|
||||
(c) 2005 TJ Mather, tjmather@maxmind.com, L<http://www.maxmind.com/>
|
||||
|
||||
All rights reserved. This package is free software; you can
|
||||
redistribute it and/or modify it under the same terms as Perl itself.
|
||||
|
||||
=head1 SEE ALSO
|
||||
|
||||
L<HTML::Parser|HTML::Parser>,
|
||||
L<Data::FormValidator|Data::FormValidato>,
|
||||
L<HTML::Template|HTML::Template>,
|
||||
L<Apache::PageKit|Apache::PageKit>
|
||||
|
||||
=head1 CREDITS
|
||||
|
||||
Fixes, Bug Reports, Docs have been generously provided by:
|
||||
|
||||
Tatsuhiko Miyagawa Joseph Yanni
|
||||
Boris Zentner Philip Mak
|
||||
Dave Rolsky Jost Krieger
|
||||
Patrick Michael Kane Gabriel Burka
|
||||
Ade Olonoh Bill Moseley
|
||||
Tom Lancaster James Tolley
|
||||
Martin H Sluka Dan Kubb
|
||||
Mark Stosberg Alexander Hartmaier
|
||||
Jonathan Swartz Paul Miller
|
||||
Trevor Schellhorn Anthony Ettinger
|
||||
Jim Miner Simon P. Ditner
|
||||
Paul Lindner Michael Peters
|
||||
Maurice Aubrey Trevor Schellhorn
|
||||
Andrew Creer
|
||||
|
||||
Thanks!
|
||||
112
database/perl/lib/HTML/Filter.pm
Normal file
112
database/perl/lib/HTML/Filter.pm
Normal file
@@ -0,0 +1,112 @@
|
||||
package HTML::Filter;
|
||||
|
||||
use strict;
|
||||
use vars qw(@ISA $VERSION);
|
||||
|
||||
require HTML::Parser;
|
||||
@ISA=qw(HTML::Parser);
|
||||
|
||||
$VERSION = "3.57";
|
||||
|
||||
sub declaration { $_[0]->output("<!$_[1]>") }
|
||||
sub process { $_[0]->output($_[2]) }
|
||||
sub comment { $_[0]->output("<!--$_[1]-->") }
|
||||
sub start { $_[0]->output($_[4]) }
|
||||
sub end { $_[0]->output($_[2]) }
|
||||
sub text { $_[0]->output($_[1]) }
|
||||
|
||||
sub output { print $_[1] }
|
||||
|
||||
1;
|
||||
|
||||
__END__
|
||||
|
||||
=head1 NAME
|
||||
|
||||
HTML::Filter - Filter HTML text through the parser
|
||||
|
||||
=head1 NOTE
|
||||
|
||||
B<This module is deprecated.> The C<HTML::Parser> now provides the
|
||||
functionally of C<HTML::Filter> much more efficiently with the the
|
||||
C<default> handler.
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
require HTML::Filter;
|
||||
$p = HTML::Filter->new->parse_file("index.html");
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
C<HTML::Filter> is an HTML parser that by default prints the
|
||||
original text of each HTML element (a slow version of cat(1) basically).
|
||||
The callback methods may be overridden to modify the filtering for some
|
||||
HTML elements and you can override output() method which is called to
|
||||
print the HTML text.
|
||||
|
||||
C<HTML::Filter> is a subclass of C<HTML::Parser>. This means that
|
||||
the document should be given to the parser by calling the $p->parse()
|
||||
or $p->parse_file() methods.
|
||||
|
||||
=head1 EXAMPLES
|
||||
|
||||
The first example is a filter that will remove all comments from an
|
||||
HTML file. This is achieved by simply overriding the comment method
|
||||
to do nothing.
|
||||
|
||||
package CommentStripper;
|
||||
require HTML::Filter;
|
||||
@ISA=qw(HTML::Filter);
|
||||
sub comment { } # ignore comments
|
||||
|
||||
The second example shows a filter that will remove any E<lt>TABLE>s
|
||||
found in the HTML file. We specialize the start() and end() methods
|
||||
to count table tags and then make output not happen when inside a
|
||||
table.
|
||||
|
||||
package TableStripper;
|
||||
require HTML::Filter;
|
||||
@ISA=qw(HTML::Filter);
|
||||
sub start
|
||||
{
|
||||
my $self = shift;
|
||||
$self->{table_seen}++ if $_[0] eq "table";
|
||||
$self->SUPER::start(@_);
|
||||
}
|
||||
|
||||
sub end
|
||||
{
|
||||
my $self = shift;
|
||||
$self->SUPER::end(@_);
|
||||
$self->{table_seen}-- if $_[0] eq "table";
|
||||
}
|
||||
|
||||
sub output
|
||||
{
|
||||
my $self = shift;
|
||||
unless ($self->{table_seen}) {
|
||||
$self->SUPER::output(@_);
|
||||
}
|
||||
}
|
||||
|
||||
If you want to collect the parsed text internally you might want to do
|
||||
something like this:
|
||||
|
||||
package FilterIntoString;
|
||||
require HTML::Filter;
|
||||
@ISA=qw(HTML::Filter);
|
||||
sub output { push(@{$_[0]->{fhtml}}, $_[1]) }
|
||||
sub filtered_html { join("", @{$_[0]->{fhtml}}) }
|
||||
|
||||
=head1 SEE ALSO
|
||||
|
||||
L<HTML::Parser>
|
||||
|
||||
=head1 COPYRIGHT
|
||||
|
||||
Copyright 1997-1999 Gisle Aas.
|
||||
|
||||
This library is free software; you can redistribute it and/or
|
||||
modify it under the same terms as Perl itself.
|
||||
|
||||
=cut
|
||||
1551
database/perl/lib/HTML/Form.pm
Normal file
1551
database/perl/lib/HTML/Form.pm
Normal file
File diff suppressed because it is too large
Load Diff
301
database/perl/lib/HTML/HeadParser.pm
Normal file
301
database/perl/lib/HTML/HeadParser.pm
Normal file
@@ -0,0 +1,301 @@
|
||||
package HTML::HeadParser;
|
||||
|
||||
=head1 NAME
|
||||
|
||||
HTML::HeadParser - Parse <HEAD> section of a HTML document
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
require HTML::HeadParser;
|
||||
$p = HTML::HeadParser->new;
|
||||
$p->parse($text) and print "not finished";
|
||||
|
||||
$p->header('Title') # to access <title>....</title>
|
||||
$p->header('Content-Base') # to access <base href="http://...">
|
||||
$p->header('Foo') # to access <meta http-equiv="Foo" content="...">
|
||||
$p->header('X-Meta-Author') # to access <meta name="author" content="...">
|
||||
$p->header('X-Meta-Charset') # to access <meta charset="...">
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
The C<HTML::HeadParser> is a specialized (and lightweight)
|
||||
C<HTML::Parser> that will only parse the E<lt>HEAD>...E<lt>/HEAD>
|
||||
section of an HTML document. The parse() method
|
||||
will return a FALSE value as soon as some E<lt>BODY> element or body
|
||||
text are found, and should not be called again after this.
|
||||
|
||||
Note that the C<HTML::HeadParser> might get confused if raw undecoded
|
||||
UTF-8 is passed to the parse() method. Make sure the strings are
|
||||
properly decoded before passing them on.
|
||||
|
||||
The C<HTML::HeadParser> keeps a reference to a header object, and the
|
||||
parser will update this header object as the various elements of the
|
||||
E<lt>HEAD> section of the HTML document are recognized. The following
|
||||
header fields are affected:
|
||||
|
||||
=over 4
|
||||
|
||||
=item Content-Base:
|
||||
|
||||
The I<Content-Base> header is initialized from the E<lt>base
|
||||
href="..."> element.
|
||||
|
||||
=item Title:
|
||||
|
||||
The I<Title> header is initialized from the E<lt>title>...E<lt>/title>
|
||||
element.
|
||||
|
||||
=item Isindex:
|
||||
|
||||
The I<Isindex> header will be added if there is a E<lt>isindex>
|
||||
element in the E<lt>head>. The header value is initialized from the
|
||||
I<prompt> attribute if it is present. If no I<prompt> attribute is
|
||||
given it will have '?' as the value.
|
||||
|
||||
=item X-Meta-Foo:
|
||||
|
||||
All E<lt>meta> elements containing a C<name> attribute will result in
|
||||
headers using the prefix C<X-Meta-> appended with the value of the
|
||||
C<name> attribute as the name of the header, and the value of the
|
||||
C<content> attribute as the pushed header value.
|
||||
|
||||
E<lt>meta> elements containing a C<http-equiv> attribute will result
|
||||
in headers as in above, but without the C<X-Meta-> prefix in the
|
||||
header name.
|
||||
|
||||
E<lt>meta> elements containing a C<charset> attribute will result in
|
||||
an C<X-Meta-Charset> header, using the value of the C<charset>
|
||||
attribute as the pushed header value.
|
||||
|
||||
=back
|
||||
|
||||
=head1 METHODS
|
||||
|
||||
The following methods (in addition to those provided by the
|
||||
superclass) are available:
|
||||
|
||||
=over 4
|
||||
|
||||
=cut
|
||||
|
||||
|
||||
require HTML::Parser;
|
||||
@ISA = qw(HTML::Parser);
|
||||
|
||||
use HTML::Entities ();
|
||||
|
||||
use strict;
|
||||
use vars qw($VERSION $DEBUG);
|
||||
#$DEBUG = 1;
|
||||
$VERSION = "3.62";
|
||||
|
||||
=item $hp = HTML::HeadParser->new
|
||||
|
||||
=item $hp = HTML::HeadParser->new( $header )
|
||||
|
||||
The object constructor. The optional $header argument should be a
|
||||
reference to an object that implement the header() and push_header()
|
||||
methods as defined by the C<HTTP::Headers> class. Normally it will be
|
||||
of some class that is a or delegates to the C<HTTP::Headers> class.
|
||||
|
||||
If no $header is given C<HTML::HeadParser> will create an
|
||||
C<HTTP::Headers> object by itself (initially empty).
|
||||
|
||||
=cut
|
||||
|
||||
sub new
|
||||
{
|
||||
my($class, $header) = @_;
|
||||
unless ($header) {
|
||||
require HTTP::Headers;
|
||||
$header = HTTP::Headers->new;
|
||||
}
|
||||
|
||||
my $self = $class->SUPER::new(api_version => 3,
|
||||
start_h => ["start", "self,tagname,attr"],
|
||||
end_h => ["end", "self,tagname"],
|
||||
text_h => ["text", "self,text"],
|
||||
ignore_elements => [qw(script style)],
|
||||
);
|
||||
$self->{'header'} = $header;
|
||||
$self->{'tag'} = ''; # name of active element that takes textual content
|
||||
$self->{'text'} = ''; # the accumulated text associated with the element
|
||||
$self;
|
||||
}
|
||||
|
||||
=item $hp->header;
|
||||
|
||||
Returns a reference to the header object.
|
||||
|
||||
=item $hp->header( $key )
|
||||
|
||||
Returns a header value. It is just a shorter way to write
|
||||
C<$hp-E<gt>header-E<gt>header($key)>.
|
||||
|
||||
=cut
|
||||
|
||||
sub header
|
||||
{
|
||||
my $self = shift;
|
||||
return $self->{'header'} unless @_;
|
||||
$self->{'header'}->header(@_);
|
||||
}
|
||||
|
||||
sub as_string # legacy
|
||||
{
|
||||
my $self = shift;
|
||||
$self->{'header'}->as_string;
|
||||
}
|
||||
|
||||
sub flush_text # internal
|
||||
{
|
||||
my $self = shift;
|
||||
my $tag = $self->{'tag'};
|
||||
my $text = $self->{'text'};
|
||||
$text =~ s/^\s+//;
|
||||
$text =~ s/\s+$//;
|
||||
$text =~ s/\s+/ /g;
|
||||
print "FLUSH $tag => '$text'\n" if $DEBUG;
|
||||
if ($tag eq 'title') {
|
||||
HTML::Entities::decode($text);
|
||||
$self->{'header'}->push_header(Title => $text);
|
||||
}
|
||||
$self->{'tag'} = $self->{'text'} = '';
|
||||
}
|
||||
|
||||
# This is an quote from the HTML3.2 DTD which shows which elements
|
||||
# that might be present in a <HEAD>...</HEAD>. Also note that the
|
||||
# <HEAD> tags themselves might be missing:
|
||||
#
|
||||
# <!ENTITY % head.content "TITLE & ISINDEX? & BASE? & STYLE? &
|
||||
# SCRIPT* & META* & LINK*">
|
||||
#
|
||||
# <!ELEMENT HEAD O O (%head.content)>
|
||||
#
|
||||
# From HTML 4.01:
|
||||
#
|
||||
# <!ENTITY % head.misc "SCRIPT|STYLE|META|LINK|OBJECT">
|
||||
# <!ENTITY % head.content "TITLE & BASE?">
|
||||
# <!ELEMENT HEAD O O (%head.content;) +(%head.misc;)>
|
||||
#
|
||||
# Added in HTML 5 as of WD-html5-20090423: noscript, command
|
||||
|
||||
sub start
|
||||
{
|
||||
my($self, $tag, $attr) = @_; # $attr is reference to a HASH
|
||||
print "START[$tag]\n" if $DEBUG;
|
||||
$self->flush_text if $self->{'tag'};
|
||||
if ($tag eq 'meta') {
|
||||
my $key = $attr->{'http-equiv'};
|
||||
if (!defined($key) || !length($key)) {
|
||||
if ($attr->{name}) {
|
||||
$key = "X-Meta-\u$attr->{name}";
|
||||
} elsif ($attr->{charset}) { # HTML 5 <meta charset="...">
|
||||
$key = "X-Meta-Charset";
|
||||
$self->{header}->push_header($key => $attr->{charset});
|
||||
return;
|
||||
} else {
|
||||
return;
|
||||
}
|
||||
}
|
||||
$self->{'header'}->push_header($key => $attr->{content});
|
||||
} elsif ($tag eq 'base') {
|
||||
return unless exists $attr->{href};
|
||||
$self->{'header'}->push_header('Content-Base' => $attr->{href});
|
||||
} elsif ($tag eq 'isindex') {
|
||||
# This is a non-standard header. Perhaps we should just ignore
|
||||
# this element
|
||||
$self->{'header'}->push_header(Isindex => $attr->{prompt} || '?');
|
||||
} elsif ($tag =~ /^(?:title|noscript|object|command)$/) {
|
||||
# Just remember tag. Initialize header when we see the end tag.
|
||||
$self->{'tag'} = $tag;
|
||||
} elsif ($tag eq 'link') {
|
||||
return unless exists $attr->{href};
|
||||
# <link href="http:..." rel="xxx" rev="xxx" title="xxx">
|
||||
my $h_val = "<" . delete($attr->{href}) . ">";
|
||||
for (sort keys %{$attr}) {
|
||||
next if $_ eq "/"; # XHTML junk
|
||||
$h_val .= qq(; $_="$attr->{$_}");
|
||||
}
|
||||
$self->{'header'}->push_header(Link => $h_val);
|
||||
} elsif ($tag eq 'head' || $tag eq 'html') {
|
||||
# ignore
|
||||
} else {
|
||||
# stop parsing
|
||||
$self->eof;
|
||||
}
|
||||
}
|
||||
|
||||
sub end
|
||||
{
|
||||
my($self, $tag) = @_;
|
||||
print "END[$tag]\n" if $DEBUG;
|
||||
$self->flush_text if $self->{'tag'};
|
||||
$self->eof if $tag eq 'head';
|
||||
}
|
||||
|
||||
sub text
|
||||
{
|
||||
my($self, $text) = @_;
|
||||
print "TEXT[$text]\n" if $DEBUG;
|
||||
unless ($self->{first_chunk}) {
|
||||
# drop Unicode BOM if found
|
||||
if ($self->utf8_mode) {
|
||||
$text =~ s/^\xEF\xBB\xBF//;
|
||||
}
|
||||
else {
|
||||
$text =~ s/^\x{FEFF}//;
|
||||
}
|
||||
$self->{first_chunk}++;
|
||||
}
|
||||
my $tag = $self->{tag};
|
||||
if (!$tag && $text =~ /\S/) {
|
||||
# Normal text means start of body
|
||||
$self->eof;
|
||||
return;
|
||||
}
|
||||
return if $tag ne 'title';
|
||||
$self->{'text'} .= $text;
|
||||
}
|
||||
|
||||
BEGIN {
|
||||
*utf8_mode = sub { 1 } unless HTML::Entities::UNICODE_SUPPORT;;
|
||||
}
|
||||
|
||||
1;
|
||||
|
||||
__END__
|
||||
|
||||
=back
|
||||
|
||||
=head1 EXAMPLE
|
||||
|
||||
$h = HTTP::Headers->new;
|
||||
$p = HTML::HeadParser->new($h);
|
||||
$p->parse(<<EOT);
|
||||
<title>Stupid example</title>
|
||||
<base href="http://www.linpro.no/lwp/">
|
||||
Normal text starts here.
|
||||
EOT
|
||||
undef $p;
|
||||
print $h->title; # should print "Stupid example"
|
||||
|
||||
=head1 SEE ALSO
|
||||
|
||||
L<HTML::Parser>, L<HTTP::Headers>
|
||||
|
||||
The C<HTTP::Headers> class is distributed as part of the
|
||||
I<libwww-perl> package. If you don't have that distribution installed
|
||||
you need to provide the $header argument to the C<HTML::HeadParser>
|
||||
constructor with your own object that implements the documented
|
||||
protocol.
|
||||
|
||||
=head1 COPYRIGHT
|
||||
|
||||
Copyright 1996-2001 Gisle Aas. All rights reserved.
|
||||
|
||||
This library is free software; you can redistribute it and/or
|
||||
modify it under the same terms as Perl itself.
|
||||
|
||||
=cut
|
||||
|
||||
185
database/perl/lib/HTML/LinkExtor.pm
Normal file
185
database/perl/lib/HTML/LinkExtor.pm
Normal file
@@ -0,0 +1,185 @@
|
||||
package HTML::LinkExtor;
|
||||
|
||||
require HTML::Parser;
|
||||
@ISA = qw(HTML::Parser);
|
||||
$VERSION = "3.60";
|
||||
|
||||
=head1 NAME
|
||||
|
||||
HTML::LinkExtor - Extract links from an HTML document
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
require HTML::LinkExtor;
|
||||
$p = HTML::LinkExtor->new(\&cb, "http://www.perl.org/");
|
||||
sub cb {
|
||||
my($tag, %links) = @_;
|
||||
print "$tag @{[%links]}\n";
|
||||
}
|
||||
$p->parse_file("index.html");
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
I<HTML::LinkExtor> is an HTML parser that extracts links from an
|
||||
HTML document. The I<HTML::LinkExtor> is a subclass of
|
||||
I<HTML::Parser>. This means that the document should be given to the
|
||||
parser by calling the $p->parse() or $p->parse_file() methods.
|
||||
|
||||
=cut
|
||||
|
||||
use strict;
|
||||
use HTML::Tagset ();
|
||||
|
||||
# legacy (some applications grabs this hash directly)
|
||||
use vars qw(%LINK_ELEMENT);
|
||||
*LINK_ELEMENT = \%HTML::Tagset::linkElements;
|
||||
|
||||
=over 4
|
||||
|
||||
=item $p = HTML::LinkExtor->new
|
||||
|
||||
=item $p = HTML::LinkExtor->new( $callback )
|
||||
|
||||
=item $p = HTML::LinkExtor->new( $callback, $base )
|
||||
|
||||
The constructor takes two optional arguments. The first is a reference
|
||||
to a callback routine. It will be called as links are found. If a
|
||||
callback is not provided, then links are just accumulated internally
|
||||
and can be retrieved by calling the $p->links() method.
|
||||
|
||||
The $base argument is an optional base URL used to absolutize all URLs found.
|
||||
You need to have the I<URI> module installed if you provide $base.
|
||||
|
||||
The callback is called with the lowercase tag name as first argument,
|
||||
and then all link attributes as separate key/value pairs. All
|
||||
non-link attributes are removed.
|
||||
|
||||
=cut
|
||||
|
||||
sub new
|
||||
{
|
||||
my($class, $cb, $base) = @_;
|
||||
my $self = $class->SUPER::new(
|
||||
start_h => ["_start_tag", "self,tagname,attr"],
|
||||
report_tags => [keys %HTML::Tagset::linkElements],
|
||||
);
|
||||
$self->{extractlink_cb} = $cb;
|
||||
if ($base) {
|
||||
require URI;
|
||||
$self->{extractlink_base} = URI->new($base);
|
||||
}
|
||||
$self;
|
||||
}
|
||||
|
||||
sub _start_tag
|
||||
{
|
||||
my($self, $tag, $attr) = @_;
|
||||
|
||||
my $base = $self->{extractlink_base};
|
||||
my $links = $HTML::Tagset::linkElements{$tag};
|
||||
$links = [$links] unless ref $links;
|
||||
|
||||
my @links;
|
||||
my $a;
|
||||
for $a (@$links) {
|
||||
next unless exists $attr->{$a};
|
||||
push(@links, $a, $base ? URI->new($attr->{$a}, $base)->abs($base)
|
||||
: $attr->{$a});
|
||||
}
|
||||
return unless @links;
|
||||
$self->_found_link($tag, @links);
|
||||
}
|
||||
|
||||
sub _found_link
|
||||
{
|
||||
my $self = shift;
|
||||
my $cb = $self->{extractlink_cb};
|
||||
if ($cb) {
|
||||
&$cb(@_);
|
||||
} else {
|
||||
push(@{$self->{'links'}}, [@_]);
|
||||
}
|
||||
}
|
||||
|
||||
=item $p->links
|
||||
|
||||
Returns a list of all links found in the document. The returned
|
||||
values will be anonymous arrays with the following elements:
|
||||
|
||||
[$tag, $attr => $url1, $attr2 => $url2,...]
|
||||
|
||||
The $p->links method will also truncate the internal link list. This
|
||||
means that if the method is called twice without any parsing
|
||||
between them the second call will return an empty list.
|
||||
|
||||
Also note that $p->links will always be empty if a callback routine
|
||||
was provided when the I<HTML::LinkExtor> was created.
|
||||
|
||||
=cut
|
||||
|
||||
sub links
|
||||
{
|
||||
my $self = shift;
|
||||
exists($self->{'links'}) ? @{delete $self->{'links'}} : ();
|
||||
}
|
||||
|
||||
# We override the parse_file() method so that we can clear the links
|
||||
# before we start a new file.
|
||||
sub parse_file
|
||||
{
|
||||
my $self = shift;
|
||||
delete $self->{'links'};
|
||||
$self->SUPER::parse_file(@_);
|
||||
}
|
||||
|
||||
=back
|
||||
|
||||
=head1 EXAMPLE
|
||||
|
||||
This is an example showing how you can extract links from a document
|
||||
received using LWP:
|
||||
|
||||
use LWP::UserAgent;
|
||||
use HTML::LinkExtor;
|
||||
use URI::URL;
|
||||
|
||||
$url = "http://www.perl.org/"; # for instance
|
||||
$ua = LWP::UserAgent->new;
|
||||
|
||||
# Set up a callback that collect image links
|
||||
my @imgs = ();
|
||||
sub callback {
|
||||
my($tag, %attr) = @_;
|
||||
return if $tag ne 'img'; # we only look closer at <img ...>
|
||||
push(@imgs, values %attr);
|
||||
}
|
||||
|
||||
# Make the parser. Unfortunately, we don't know the base yet
|
||||
# (it might be different from $url)
|
||||
$p = HTML::LinkExtor->new(\&callback);
|
||||
|
||||
# Request document and parse it as it arrives
|
||||
$res = $ua->request(HTTP::Request->new(GET => $url),
|
||||
sub {$p->parse($_[0])});
|
||||
|
||||
# Expand all image URLs to absolute ones
|
||||
my $base = $res->base;
|
||||
@imgs = map { $_ = url($_, $base)->abs; } @imgs;
|
||||
|
||||
# Print them out
|
||||
print join("\n", @imgs), "\n";
|
||||
|
||||
=head1 SEE ALSO
|
||||
|
||||
L<HTML::Parser>, L<HTML::Tagset>, L<LWP>, L<URI::URL>
|
||||
|
||||
=head1 COPYRIGHT
|
||||
|
||||
Copyright 1996-2001 Gisle Aas.
|
||||
|
||||
This library is free software; you can redistribute it and/or
|
||||
modify it under the same terms as Perl itself.
|
||||
|
||||
=cut
|
||||
|
||||
1;
|
||||
1240
database/perl/lib/HTML/Parser.pm
Normal file
1240
database/perl/lib/HTML/Parser.pm
Normal file
File diff suppressed because it is too large
Load Diff
213
database/perl/lib/HTML/Perlinfo.pm
Normal file
213
database/perl/lib/HTML/Perlinfo.pm
Normal file
@@ -0,0 +1,213 @@
|
||||
package HTML::Perlinfo;
|
||||
BEGIN { %Seen = %INC }
|
||||
|
||||
use strict;
|
||||
use warnings;
|
||||
use Carp ();
|
||||
|
||||
use HTML::Perlinfo::Apache;
|
||||
use HTML::Perlinfo::Modules;
|
||||
use HTML::Perlinfo::Common;
|
||||
|
||||
use base qw(Exporter HTML::Perlinfo::Base);
|
||||
our @EXPORT = qw(perlinfo);
|
||||
|
||||
our $VERSION = '1.61';
|
||||
|
||||
sub perlinfo {
|
||||
my ($opt) = @_;
|
||||
$opt = 'INFO_ALL' unless $opt;
|
||||
|
||||
error_msg("Invalid perlinfo() parameter: @_")
|
||||
if (($opt !~ /^INFO_(?:ALL|GENERAL|CONFIG|VARIABLES|APACHE|MODULES|LICENSE|LOADED)$/) || @_ > 1);
|
||||
|
||||
$opt = lc $opt;
|
||||
my $p = HTML::Perlinfo->new();
|
||||
$p->$opt;
|
||||
|
||||
}
|
||||
%INC = %HTML::Perlinfo::Seen;
|
||||
1;
|
||||
__END__
|
||||
=pod
|
||||
|
||||
=head1 NAME
|
||||
|
||||
HTML::Perlinfo - Display a lot of Perl information in HTML format
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
use HTML::Perlinfo;
|
||||
|
||||
perlinfo();
|
||||
|
||||
|
||||
use HTML::Perlinfo;
|
||||
use CGI qw(header);
|
||||
|
||||
$|++;
|
||||
|
||||
print header;
|
||||
perlinfo(INFO_MODULES);
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
This module outputs a large amount of information about your Perl installation in HTML. So far, this includes information about Perl compilation options, the Perl version, server information and environment, HTTP headers, OS version information, Perl modules, and more.
|
||||
|
||||
HTML::Perlinfo is aimed at Web developers, but almost anyone using Perl may find it useful. It is a valuable debugging tool as it contains all EGPCS (Environment, GET, POST, Cookie, Server) data. It will also work under taint mode.
|
||||
|
||||
The output may be customized by passing one of the following options.
|
||||
|
||||
=head1 OPTIONS
|
||||
|
||||
There are 8 options to pass to the perlinfo funtion. All of these options are also object methods. The key difference is their case: Captilize the option name when passing it to the function and use only lower-case letters when using the object-oriented approach.
|
||||
|
||||
=over
|
||||
|
||||
=item INFO_GENERAL
|
||||
|
||||
The Perl version, build date, and more.
|
||||
|
||||
=item INFO_VARIABLES
|
||||
|
||||
Shows all predefined variables from EGPCS (Environment, GET, POST, Cookie, Server).
|
||||
|
||||
=item INFO_CONFIG
|
||||
|
||||
All configuration values from config_sh. INFO_ALL shows only some values.
|
||||
|
||||
=item INFO_APACHE
|
||||
|
||||
Apache HTTP server information, including mod_perl information.
|
||||
|
||||
=item INFO_MODULES
|
||||
|
||||
All installed modules, their version number and description. INFO_ALL shows only core modules.
|
||||
Please also see L<HTML::Perlinfo::Modules>.
|
||||
|
||||
=item INFO_LOADED
|
||||
|
||||
Post-execution dump of loaded modules (plus INFO_VARIABLES). INFO_ALL shows only core modules. Please also see L<HTML::Perlinfo::Loaded>.
|
||||
|
||||
=item INFO_LICENSE
|
||||
|
||||
Perl license information.
|
||||
|
||||
=item INFO_ALL
|
||||
|
||||
Shows all of the above defaults. This is the default value.
|
||||
|
||||
=back
|
||||
|
||||
=head1 PROGRAMMING STYLE
|
||||
|
||||
There are two styles of programming with Perlinfo.pm, a function-oriented style and an object-oriented style.
|
||||
|
||||
Function-oriented style:
|
||||
|
||||
# Show all information, defaults to INFO_ALL
|
||||
perlinfo();
|
||||
|
||||
# Show only module information. This shows all installed modules.
|
||||
perlinfo(INFO_MODULES);
|
||||
|
||||
Object-oriented style:
|
||||
|
||||
$p = new HTML::Perlinfo;
|
||||
$p->info_all;
|
||||
|
||||
# You can also set the CSS values in the constructor!
|
||||
$p = HTML::Perlinfo->new(
|
||||
bg_image => 'http://i104.photobucket.com/albums/m176/perlinfo/camel.gif',
|
||||
bg_repeat => 'yes-repeat'
|
||||
);
|
||||
$p->info_all;
|
||||
|
||||
More examples ...
|
||||
|
||||
# This is wrong (no capitals)
|
||||
$p->INFO_MODULES;
|
||||
|
||||
# But this is correct
|
||||
perlinfo(INFO_MODULES);
|
||||
|
||||
# Ditto
|
||||
$p->info_modules;
|
||||
|
||||
=head1 CUSTOMIZING THE HTML
|
||||
|
||||
You can capture the HTML output and manipulate it or you can alter CSS elements with object attributes.
|
||||
|
||||
For further details and examples, please see the L<HTML documentation|HTML::Perlinfo::HTML> in the HTML::Perlinfo distribution.
|
||||
|
||||
=head1 SECURITY
|
||||
|
||||
Displaying detailed server information on the internet is not a good idea and HTML::Perlinfo reveals a lot of information about the local environment. While restricting what system users can publish online is wise, you can also hinder them from using the module by installing it outside of the usual module directories (see perldoc -q lib). Of course, preventing users from installing the module in their own home directories is another matter entirely.
|
||||
|
||||
=head1 REQUIREMENTS
|
||||
|
||||
HTML::Perlinfo does not require any non-core modules.
|
||||
|
||||
=head1 NOTES
|
||||
|
||||
INFO_APACHE relies soley on environment variables.
|
||||
|
||||
INFO_VARIABLES did not work correctly until version 1.52.
|
||||
|
||||
INFO_LOADED is the only option whose output cannot be assigned to a scalar.
|
||||
|
||||
Some might notice that HTML::Perlinfo shares the look and feel of the PHP function phpinfo. It was originally inspired by that function and was first released in 2004 as PHP::Perlinfo, which is no longer available on CPAN.
|
||||
|
||||
Since the module outputs HTML, you may want to use it in a CGI script, but you do not have to. Of course, some information, like HTTP headers, would not be available if you use the module at the command-line. If you decide to use this module in a CGI script, B<make sure you print out the content-type header beforehand>. For example:
|
||||
|
||||
use HTML::Perlinfo;
|
||||
|
||||
print "Content-type: text/html\n\n";
|
||||
perlinfo();
|
||||
|
||||
I prefer to use the header function from the CGI module:
|
||||
|
||||
use HTML::Perlinfo;
|
||||
use CGI qw(header);
|
||||
|
||||
print header;
|
||||
perlinfo();
|
||||
|
||||
In this example, I am flushing the buffer because I know that there will be a lot of modules:
|
||||
|
||||
use HTML::Perlinfo;
|
||||
use CGI qw(header);
|
||||
|
||||
$|++;
|
||||
|
||||
print header;
|
||||
perlinfo(INFO_MODULES);
|
||||
|
||||
HTML::Perlinfo stopped printing the header automatically as of version 1.43.
|
||||
|
||||
=head1 BUGS
|
||||
|
||||
Please report any bugs or feature requests to C<bug-html-perlinfo@rt.cpan.org>, or through the web interface at
|
||||
L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=HTML-Perlinfo>.
|
||||
I will be notified, and then you'll automatically be notified of progress on your bug as I make changes.
|
||||
|
||||
=head1 SEE ALSO
|
||||
|
||||
L<Config>. You can also use "perl -V" to see a configuration summary at the command-line.
|
||||
|
||||
L<CGI::Carp::Fatals>, L<Apache::Status>, L<App::Info>, L<Probe::Perl>, L<Module::CoreList>, L<Module::Info>, among others.
|
||||
|
||||
Also included in the Perlinfo distribution: L<perlinfo>, L<HTML::Perlinfo::Loaded>, L<HTML::Perlinfo::Modules>
|
||||
|
||||
=head1 AUTHOR
|
||||
|
||||
Mike Accardo <mikeaccardo@yahoo.com>
|
||||
|
||||
=head1 COPYRIGHT
|
||||
|
||||
Copyright (c) 2004-9, Mike Accardo. All Rights Reserved.
|
||||
This module is free software. It may be used, redistributed
|
||||
and/or modified under the terms of the Perl Artistic License.
|
||||
|
||||
=cut
|
||||
|
||||
100
database/perl/lib/HTML/Perlinfo/Apache.pm
Normal file
100
database/perl/lib/HTML/Perlinfo/Apache.pm
Normal file
@@ -0,0 +1,100 @@
|
||||
package HTML::Perlinfo::Apache;
|
||||
|
||||
use warnings;
|
||||
use strict;
|
||||
use HTML::Perlinfo::Common;
|
||||
|
||||
|
||||
|
||||
sub new {
|
||||
my %apache;
|
||||
my $env = ( $ENV{SERVER_SOFTWARE} || "" ) =~ /apache/i;
|
||||
my $mp = exists $ENV{MOD_PERL};
|
||||
$apache{'env'} = $env;
|
||||
$apache{'mp'} = $mp;
|
||||
bless \%apache;
|
||||
}
|
||||
|
||||
sub has {
|
||||
my ( $self, @opts ) = @_;
|
||||
for my $opt (@opts) {
|
||||
return 0 unless $self->{$opt};
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
sub print_apache {
|
||||
|
||||
my $self = shift;
|
||||
my @mods;
|
||||
my ($version, $hostname, $port, $mp_status) = ("<i>Not detected</i>") x 7;
|
||||
|
||||
$mp_status = 'enabled' if $self->has qw(mp);
|
||||
|
||||
($version) = $ENV{'SERVER_SOFTWARE'} =~ /(\d+[\.\d]*)/
|
||||
if (defined $ENV{'SERVER_SOFTWARE'} && $ENV{'SERVER_SOFTWARE'} =~ /\d+[\.\d]*/);
|
||||
|
||||
|
||||
return join '', print_table_row(2, "Apache Version", "$version"),
|
||||
(defined($ENV{'SERVER_NAME'}) && defined($ENV{'SERVER_PORT'})) ?
|
||||
print_table_row(2, "Hostname:Port", "$ENV{'SERVER_NAME'} : $ENV{'SERVER_PORT'}"):
|
||||
print_table_row(2, "Hostname:Port", "$hostname : $port"),
|
||||
print_table_row(2, "mod_perl", "$mp_status");
|
||||
}
|
||||
|
||||
sub print_modperl {
|
||||
|
||||
my ($version_status, $version_number) = ("<i>Not detected</i>") x 3;
|
||||
|
||||
$version_status = '1.0';
|
||||
$version_status = '2.0 or higher' if $ENV{MOD_PERL_API_VERSION} && $ENV{MOD_PERL_API_VERSION} == 2;
|
||||
($version_number) = $ENV{MOD_PERL} =~ /^\S+\/(\d+(?:[\.\_]\d+)+)/;
|
||||
$version_number =~ s/_//g;
|
||||
$version_number =~ s/(\.[^.]+)\./$1/g;
|
||||
unless ($version_status eq '2.0 or higher') {
|
||||
if ( $version_number >= 1.9901 ) {
|
||||
$version_status = '1.9 which is incompatible with 2.0';
|
||||
}
|
||||
}
|
||||
|
||||
return join '', print_box_start(0),
|
||||
"Running under mod_perl version $version_status (version number: $version_number)",
|
||||
print_box_end();
|
||||
|
||||
}
|
||||
|
||||
sub print_apache_environment {
|
||||
|
||||
|
||||
return join '', print_table_row(2, "DOCUMENT_ROOT", $ENV{'DOCUMENT_ROOT'}),
|
||||
print_table_row(2, "HTTP_ACCEPT", $ENV{'HTTP_ACCEPT'}),
|
||||
print_table_row(2, "HTTP_ACCEPT_CHARSET", $ENV{'HTTP_ACCEPT_CHARSET'}),
|
||||
print_table_row(2, "HTTP_ACCEPT_ENCODING", $ENV{'HTTP_ACCEPT_ENCODING'}),
|
||||
print_table_row(2, "HTTP_ACCEPT_LANGUAGE", $ENV{'HTTP_ACCEPT_LANGUAGE'}),
|
||||
print_table_row(2, "HTTP_CONNECTION", $ENV{'HTTP_CONNECTION'}),
|
||||
print_table_row(2, "HTTP_HOSTS", $ENV{'HTTP_HOSTS'}),
|
||||
print_table_row(2, "HTTP_KEEP_ALIVE", $ENV{'HTTP_KEEP_ALIVE'}),
|
||||
print_table_row(2, "HTTP_USER_AGENT", $ENV{'HTTP_USER_AGENT'}),
|
||||
print_table_row(2, "PATH", $ENV{'PATH'}),
|
||||
print_table_row(2, "REMOTE_ADDR", $ENV{'REMOTE_ADDR'}),
|
||||
print_table_row(2, "REMOTE_HOST", $ENV{'REMOTE_HOST'}),
|
||||
print_table_row(2, "REMOTE_PORT", $ENV{'REMOTE_PORT'}),
|
||||
print_table_row(2, "SCRIPT_FILENAME", $ENV{'SCRIPT_FILENAME'}),
|
||||
print_table_row(2, "SCRIPT_URI", $ENV{'SCRIPT_URI'}),
|
||||
print_table_row(2, "SCRIPT_URL", $ENV{'SCRIPT_URL'}),
|
||||
print_table_row(2, "SERVER_ADDR", $ENV{'SERVER_ADDR'}),
|
||||
print_table_row(2, "SERVER_ADMIN", $ENV{'SERVER_ADMIN'}),
|
||||
print_table_row(2, "SERVER_NAME", $ENV{'SERVER_NAME'}),
|
||||
print_table_row(2, "SERVER_PORT", $ENV{'SERVER_PORT'}),
|
||||
print_table_row(2, "SERVER_SIGNATURE", $ENV{'SERVER_SIGNATURE'}),
|
||||
print_table_row(2, "SERVER_SOFTWARE", $ENV{'SERVER_SOFTWARE'}),
|
||||
print_table_row(2, "GATEWAY_INTERFACE", $ENV{'GATEWAY_INTERFACE'}),
|
||||
print_table_row(2, "SERVER_PROTOCOL", $ENV{'SERVER_PROTOCOL'}),
|
||||
print_table_row(2, "REQUEST_METHOD", $ENV{'REQUEST_METHOD'}),
|
||||
print_table_row(2, "QUERY_STRING", $ENV{'QUERY_STRING'}),
|
||||
print_table_row(2, "REQUEST_URI", $ENV{'REQUEST_URI'}),
|
||||
print_table_row(2, "SCRIPT_NAME", $ENV{'SCRIPT_NAME'});
|
||||
|
||||
}
|
||||
1;
|
||||
|
||||
249
database/perl/lib/HTML/Perlinfo/Base.pm
Normal file
249
database/perl/lib/HTML/Perlinfo/Base.pm
Normal file
@@ -0,0 +1,249 @@
|
||||
package HTML::Perlinfo::Base;
|
||||
|
||||
|
||||
use HTML::Perlinfo::Common;
|
||||
use HTML::Perlinfo::General;
|
||||
use Carp ();
|
||||
use warnings;
|
||||
use strict;
|
||||
|
||||
sub new {
|
||||
my ($class, %params) = @_;
|
||||
my $self = {};
|
||||
$self->{full_page} = 1;
|
||||
$self->{title} = 0;
|
||||
$self->{bg_image} = '';
|
||||
$self->{bg_position} = 'center';
|
||||
$self->{bg_repeat} = 'no_repeat';
|
||||
$self->{bg_attribute} = 'fixed';
|
||||
$self->{bg_color} = '#ffffff';
|
||||
$self->{ft_family} = 'sans-serif';
|
||||
$self->{ft_color} = '#000000';
|
||||
$self->{lk_color} = '#000099';
|
||||
$self->{lk_decoration} = 'none';
|
||||
$self->{lk_bgcolor} = '';
|
||||
$self->{lk_hvdecoration} = 'underline';
|
||||
$self->{header_bgcolor} = '#9999cc';
|
||||
$self->{header_ftcolor} = '#000000';
|
||||
$self->{leftcol_bgcolor} = '#ccccff';
|
||||
$self->{leftcol_ftcolor} = '#000000';
|
||||
$self->{rightcol_bgcolor} = '#cccccc';
|
||||
$self->{rightcol_ftcolor} = '#000000';
|
||||
|
||||
foreach my $key (keys %params) {
|
||||
if (exists $self->{$key}) {
|
||||
$self->{$key} = $params{$key};
|
||||
}
|
||||
else {
|
||||
error_msg("$key is an invalid attribute");
|
||||
}
|
||||
}
|
||||
|
||||
bless $self, $class;
|
||||
return $self;
|
||||
|
||||
}
|
||||
|
||||
sub info_all {
|
||||
my $self = shift;
|
||||
my %param = @_;
|
||||
error_msg("invalid parameter") if (defined $_[0] && exists $param{'links'} && ref $param{'links'} ne 'ARRAY');
|
||||
$self->links(@{$param{'links'}}) if exists $param{'links'};
|
||||
my $html;
|
||||
$self->{title} = 'perlinfo(INFO_ALL)' unless $self->{title};
|
||||
$html .= $self->print_htmlhead() if $self->{full_page};
|
||||
$html .= print_general();
|
||||
$html .= print_variables();
|
||||
$html .= print_thesemodules('core');
|
||||
$html .= print_license();
|
||||
$html .= "</div></body></html>" if $self->{full_page};
|
||||
defined wantarray ? return $html : print $html;
|
||||
}
|
||||
sub info_general {
|
||||
my $self = shift;
|
||||
my %param = @_;
|
||||
error_msg("invalid parameter") if (defined $_[0] && exists $param{'links'} && ref $param{'links'} ne 'ARRAY');
|
||||
$self->links(@{$param{'links'}}) if exists $param{'links'};
|
||||
my $html;
|
||||
$self->{title} = 'perlinfo(INFO_GENERAL)' unless $self->{title};
|
||||
$html .= $self->print_htmlhead() if $self->{full_page};
|
||||
$html .= print_general('top');
|
||||
$html .= "</div></body></html>" if $self->{full_page};
|
||||
defined wantarray ? return $html : print $html;
|
||||
}
|
||||
|
||||
sub info_loaded {
|
||||
|
||||
my $self = shift;
|
||||
$self->{'title'} = 'perlinfo(INFO_LOADED)' unless $self->{'title'};
|
||||
my $html;
|
||||
$html .= $self->print_htmlhead() if $self->{'full_page'};
|
||||
|
||||
eval qq{
|
||||
|
||||
END {
|
||||
delete \$INC{'HTML/Perlinfo.pm'};
|
||||
\$html .= print_thesemodules('loaded',[values %INC]);
|
||||
\$html .= print_variables();
|
||||
\$html .= '</div></body></html>' if \$self->{'full_page'};
|
||||
print \$html;
|
||||
}
|
||||
|
||||
}; die $@ if $@;
|
||||
|
||||
}
|
||||
|
||||
sub info_modules {
|
||||
my $self = shift;
|
||||
my %param = @_;
|
||||
error_msg("invalid parameter") if (defined $_[0] && exists $param{'links'} && ref $param{'links'} ne 'ARRAY');
|
||||
$self->links(@{$param{'links'}}) if exists $param{'links'};
|
||||
my $html;
|
||||
$self->{title} = 'perlinfo(INFO_MODULES)' unless $self->{title};
|
||||
$html .= $self->print_htmlhead() if $self->{'full_page'};
|
||||
$html .= print_thesemodules('all');
|
||||
$html .= "</div></body></html>" if $self->{'full_page'};
|
||||
defined wantarray ? return $html : print $html;
|
||||
}
|
||||
sub info_config {
|
||||
my $self = shift;
|
||||
my %param = @_;
|
||||
error_msg("invalid parameter") if (defined $_[0] && exists $param{'links'} && ref $param{'links'} ne 'ARRAY');
|
||||
$self->links(@{$param{'links'}}) if exists $param{'links'};
|
||||
my $html;
|
||||
$self->{title} = 'perlinfo(INFO_CONFIG)' unless $self->{title};
|
||||
$html .= $self->print_htmlhead() if $self->{full_page};
|
||||
$html .= print_config('info_config');
|
||||
$html .= "</div></body></html>" if $self->{full_page};
|
||||
defined wantarray ? return $html : print $html;
|
||||
}
|
||||
sub info_apache {
|
||||
my $self = shift;
|
||||
my %param = @_;
|
||||
error_msg("invalid parameter") if (defined $_[0] && exists $param{'links'} && ref $param{'links'} ne 'ARRAY');
|
||||
$self->links(@{$param{'links'}}) if exists $param{'links'};
|
||||
my $html;
|
||||
$self->{title} = 'perlinfo(INFO_APACHE)' unless $self->{title};
|
||||
$html .= $self->print_htmlhead() if $self->{full_page};
|
||||
$html .= print_httpd();
|
||||
$html .= "</div></body></html>" if $self->{full_page};
|
||||
defined wantarray ? return $html : print $html;
|
||||
}
|
||||
sub info_variables {
|
||||
my $self = shift;
|
||||
my %param = @_;
|
||||
error_msg("invalid parameter") if (defined $_[0] && exists $param{'links'} && ref $param{'links'} ne 'ARRAY');
|
||||
$self->links(@{$param{'links'}}) if exists $param{'links'};
|
||||
my $html;
|
||||
$self->{title} = 'perlinfo(INFO_VARIABLES)' unless $self->{title};
|
||||
$html .= $self->print_htmlhead() if $self->{full_page};
|
||||
$html .= print_variables();
|
||||
$html .= "</div></body></html>" if $self->{full_page};
|
||||
defined wantarray ? return $html : print $html;
|
||||
}
|
||||
|
||||
sub info_license {
|
||||
my $self = shift;
|
||||
my %param = @_;
|
||||
error_msg("invalid parameter") if (defined $_[0] && exists $param{'links'} && ref $param{'links'} ne 'ARRAY');
|
||||
$self->links(@{$param{'links'}}) if exists $param{'links'};
|
||||
my $html;
|
||||
$self->{title} = 'perlinfo(INFO_LICENSE)' unless $self->{title};
|
||||
$html .= $self->print_htmlhead() if $self->{full_page};
|
||||
$html .= print_license();
|
||||
$html .= "</div></body></html>" if $self->{full_page};
|
||||
defined wantarray ? return $html : print $html;
|
||||
}
|
||||
|
||||
|
||||
sub print_htmlhead {
|
||||
|
||||
my $self = shift;
|
||||
|
||||
my $title = $self->{title};
|
||||
my $bg_image = $self->{bg_image};
|
||||
my $bg_position = $self->{bg_position};
|
||||
my $bg_repeat = $self->{bg_repeat};
|
||||
my $bg_attribute = $self->{bg_attribute};
|
||||
my $bg_color = $self->{bg_color};
|
||||
|
||||
my $ft_family = $self->{ft_family};
|
||||
my $ft_color = $self->{ft_color};
|
||||
my $lk_color = $self->{lk_color};
|
||||
my $lk_decoration = $self->{lk_decoration};
|
||||
my $lk_bgcolor = $self->{lk_bgcolor};
|
||||
my $lk_hvdecoration = $self->{lk_hvdecoration};
|
||||
|
||||
my $header_bgcolor = $self->{header_bgcolor};
|
||||
my $header_ftcolor = $self->{header_ftcolor};
|
||||
my $leftcol_bgcolor =$self->{leftcol_bgcolor};
|
||||
my $leftcol_ftcolor = $self->{leftcol_ftcolor};
|
||||
my $rightcol_bgcolor = $self->{rightcol_bgcolor};
|
||||
my $rightcol_ftcolor = $self->{rightcol_ftcolor};
|
||||
|
||||
my $html = <<"END_OF_HTML";
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
||||
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
||||
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
|
||||
<head>
|
||||
<style type="text/css"><!--
|
||||
body {
|
||||
background-color: $bg_color;
|
||||
background-image: url($bg_image);
|
||||
background-position: $bg_position;
|
||||
background-repeat: $bg_repeat;
|
||||
background-attachment: $bg_attribute;
|
||||
color: $ft_color;}
|
||||
body, td, th, h1, h2 {font-family: $ft_family;}
|
||||
pre {margin: 0px; font-family: monospace;}
|
||||
a:link {color: $lk_color; text-decoration: $lk_decoration; background-color: $lk_bgcolor;}
|
||||
a:hover {text-decoration: $lk_hvdecoration;}
|
||||
table {border-collapse: collapse;}
|
||||
.center {text-align: center;}
|
||||
.center table { margin-left: auto; margin-right: auto; text-align: left;}
|
||||
.center th { text-align: center !important; }
|
||||
td, th { border: 1px solid #000000; font-size: 75%; vertical-align: baseline;}
|
||||
.modules table {border: 0;}
|
||||
.modules td { border:0; font-size: 100%; vertical-align: baseline;}
|
||||
.modules th { border:0; font-size: 100%; vertical-align: baseline;}
|
||||
h1 {font-size: 150%;}
|
||||
h2 {font-size: 125%;}
|
||||
.p {text-align: left;}
|
||||
.e {background-color: $leftcol_bgcolor; font-weight: bold; color: $leftcol_ftcolor;}
|
||||
.h {background-color: $header_bgcolor; font-weight: bold; color: $header_ftcolor;}
|
||||
.v {background-color: $rightcol_bgcolor; color: $rightcol_ftcolor;}
|
||||
i {color: #666666; background-color: #cccccc;}
|
||||
img {float: right; border: 0px;}
|
||||
hr {width: 600px; background-color: #cccccc; border: 0px; height: 1px; color: #000000;}
|
||||
//--></style>
|
||||
<title>$title</title>
|
||||
</head>
|
||||
<body><div class="center">
|
||||
END_OF_HTML
|
||||
|
||||
defined wantarray ? return $html : print $html;
|
||||
}
|
||||
|
||||
sub links {
|
||||
|
||||
my $self = shift;
|
||||
my $args = process_args(@_, \&check_args);
|
||||
if (exists $args->{'0'}) {
|
||||
$HTML::Perlinfo::Common::links{'all'} = 0;
|
||||
}
|
||||
elsif (exists $args->{'1'}) {
|
||||
$HTML::Perlinfo::Common::links{'all'} = 1;
|
||||
}
|
||||
elsif (exists $args->{'docs'} && not exists $args->{'local'}) {
|
||||
$HTML::Perlinfo::Common::links{'docs'} = $args->{'docs'};
|
||||
}
|
||||
elsif (exists $args->{'local'} && not exists $args->{'docs'}) {
|
||||
$HTML::Perlinfo::Common::links{'local'} = $args->{'local'};
|
||||
}
|
||||
elsif (exists $args->{'docs'} && exists $args->{'local'}) {
|
||||
$HTML::Perlinfo::Common::links{'docs'} = $args->{'docs'},
|
||||
$HTML::Perlinfo::Common::links{'local'} = $args->{'local'},
|
||||
}
|
||||
}
|
||||
1;
|
||||
515
database/perl/lib/HTML/Perlinfo/Common.pm
Normal file
515
database/perl/lib/HTML/Perlinfo/Common.pm
Normal file
@@ -0,0 +1,515 @@
|
||||
package HTML::Perlinfo::Common;
|
||||
use CGI qw(escapeHTML);
|
||||
|
||||
our @ISA = qw(Exporter);
|
||||
our @EXPORT = qw(initialize_globals print_table_colspan_header print_table_row print_table_color_start print_table_color_end print_color_box print_table_row_color print_table_start print_table_end print_box_start print_box_end print_hr print_table_header print_section print_license add_link check_path check_args check_module_args perl_version release_date process_args error_msg match_string);
|
||||
require Exporter;
|
||||
|
||||
use Carp ();
|
||||
|
||||
our %links;
|
||||
|
||||
%links = (
|
||||
'all' => 1,
|
||||
'local' => 0,
|
||||
'docs' => 1,
|
||||
);
|
||||
|
||||
|
||||
##### The following is lifted from File::Which 0.05 by Per Einar Ellefsen.
|
||||
##### The check_path sub uses the which sub.
|
||||
#############
|
||||
use File::Spec;
|
||||
|
||||
my $Is_VMS = ($^O eq 'VMS');
|
||||
my $Is_MacOS = ($^O eq 'MacOS');
|
||||
my $Is_DOSish = (($^O eq 'MSWin32') or
|
||||
($^O eq 'dos') or
|
||||
($^O eq 'os2'));
|
||||
|
||||
# For Win32 systems, stores the extensions used for
|
||||
# executable files
|
||||
# For others, the empty string is used
|
||||
# because 'perl' . '' eq 'perl' => easier
|
||||
my @path_ext = ('');
|
||||
if ($Is_DOSish) {
|
||||
if ($ENV{PATHEXT} and $Is_DOSish) { # WinNT. PATHEXT might be set on Cygwin, but not used.
|
||||
push @path_ext, split ';', $ENV{PATHEXT};
|
||||
}
|
||||
else {
|
||||
push @path_ext, qw(.com .exe .bat); # Win9X or other: doesn't have PATHEXT, so needs hardcoded.
|
||||
}
|
||||
}
|
||||
elsif ($Is_VMS) {
|
||||
push @path_ext, qw(.exe .com);
|
||||
}
|
||||
|
||||
sub which {
|
||||
my ($exec) = @_;
|
||||
|
||||
return undef unless $exec;
|
||||
|
||||
my $all = wantarray;
|
||||
my @results = ();
|
||||
|
||||
# check for aliases first
|
||||
if ($Is_VMS) {
|
||||
my $symbol = `SHOW SYMBOL $exec`;
|
||||
chomp($symbol);
|
||||
if (!$?) {
|
||||
return $symbol unless $all;
|
||||
push @results, $symbol;
|
||||
}
|
||||
}
|
||||
if ($Is_MacOS) {
|
||||
my @aliases = split /\,/, $ENV{Aliases};
|
||||
foreach my $alias (@aliases) {
|
||||
# This has not been tested!!
|
||||
# PPT which says MPW-Perl cannot resolve `Alias $alias`,
|
||||
# let's just hope it's fixed
|
||||
if (lc($alias) eq lc($exec)) {
|
||||
chomp(my $file = `Alias $alias`);
|
||||
last unless $file; # if it failed, just go on the normal way
|
||||
return $file unless $all;
|
||||
push @results, $file;
|
||||
# we can stop this loop as if it finds more aliases matching,
|
||||
# it'll just be the same result anyway
|
||||
last;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
my @path = File::Spec->path();
|
||||
unshift @path, File::Spec->curdir if $Is_DOSish or $Is_VMS or $Is_MacOS;
|
||||
|
||||
for my $base (map { File::Spec->catfile($_, $exec) } @path) {
|
||||
for my $ext (@path_ext) {
|
||||
my $file = $base.$ext;
|
||||
|
||||
if ((-x $file or # executable, normal case
|
||||
($Is_MacOS || # MacOS doesn't mark as executable so we check -e
|
||||
($Is_DOSish and grep { $file =~ /$_$/i } @path_ext[1..$#path_ext])
|
||||
# DOSish systems don't pass -x on non-exe/bat/com files.
|
||||
# so we check -e. However, we don't want to pass -e on files
|
||||
# that aren't in PATHEXT, like README.
|
||||
and -e _)
|
||||
) and !-d _)
|
||||
{ # and finally, we don't want dirs to pass (as they are -x)
|
||||
|
||||
|
||||
return $file unless $all;
|
||||
push @results, $file; # Make list to return later
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if($all) {
|
||||
return @results;
|
||||
} else {
|
||||
return undef;
|
||||
}
|
||||
}
|
||||
|
||||
## End File::Which code
|
||||
|
||||
sub check_path {
|
||||
|
||||
return add_link('local', which("$_[0]")) if which("$_[0]");
|
||||
return "<i>not in path</i>";
|
||||
|
||||
}
|
||||
|
||||
sub match_string {
|
||||
my($module_name, $string) = @_;
|
||||
|
||||
my $result = 0;
|
||||
my @string = (ref $string eq 'ARRAY') ? @$string : ($string);
|
||||
foreach(@string) {
|
||||
$result = index(lc($module_name), lc($_));
|
||||
last if ($result != -1);
|
||||
}
|
||||
return ($result == -1) ? 0 : 1;
|
||||
|
||||
}
|
||||
|
||||
sub perl_version {
|
||||
my $version;
|
||||
if ($] >= 5.006) {
|
||||
$version = sprintf "%vd", $^V;
|
||||
}
|
||||
else { # else time to update Perl!
|
||||
$version = "$]";
|
||||
}
|
||||
return $version;
|
||||
}
|
||||
|
||||
sub release_date {
|
||||
|
||||
# when things escaped
|
||||
%released = (
|
||||
5.000 => '1994-10-17',
|
||||
5.001 => '1995-03-14',
|
||||
5.002 => '1996-02-96',
|
||||
5.00307 => '1996-10-10',
|
||||
5.004 => '1997-05-15',
|
||||
5.005 => '1998-07-22',
|
||||
5.00503 => '1999-03-28',
|
||||
5.00405 => '1999-04-29',
|
||||
5.006 => '2000-03-22',
|
||||
5.006001 => '2001-04-08',
|
||||
5.007003 => '2002-03-05',
|
||||
5.008 => '2002-07-19',
|
||||
5.008001 => '2003-09-25',
|
||||
5.009 => '2003-10-27',
|
||||
5.008002 => '2003-11-05',
|
||||
5.006002 => '2003-11-15',
|
||||
5.008003 => '2004-01-14',
|
||||
5.00504 => '2004-02-23',
|
||||
5.009001 => '2004-03-16',
|
||||
5.008004 => '2004-04-21',
|
||||
5.008005 => '2004-07-19',
|
||||
5.008006 => '2004-11-27',
|
||||
5.009002 => '2005-04-01',
|
||||
5.008007 => '2005-05-30',
|
||||
5.009003 => '2006-01-28',
|
||||
5.008008 => '2006-01-31',
|
||||
5.009004 => '2006-08-15',
|
||||
5.009005 => '2007-07-07',
|
||||
5.010000 => '2007-12-18',
|
||||
);
|
||||
|
||||
# Do we have Module::Corelist
|
||||
eval{require Module::CoreList};
|
||||
if ($@) { # no
|
||||
return ($released{$]}) ? $released{$]} : "unknown";
|
||||
}
|
||||
else { # yes
|
||||
return ($Module::CoreList::released{$]}) ? $Module::CoreList::released{$]} : "unknown";
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
sub check_args {
|
||||
|
||||
my ($key, $value) = @_;
|
||||
my ($message, %allowed);
|
||||
@allowed{qw(docs local 0 1)} = ();
|
||||
|
||||
if (not exists $allowed{$key}) {
|
||||
$message = "$key is an invalid links parameter";
|
||||
}
|
||||
elsif ($key =~ /(?:docs|local)/ && $value !~ /^(?:0|1)$/i) {
|
||||
$message = "$value is an invalid value for the $key parameter in the links attribute";
|
||||
}
|
||||
|
||||
error_msg("$message") if $message;
|
||||
|
||||
}
|
||||
|
||||
sub check_module_args {
|
||||
|
||||
my ($key, $value) = @_;
|
||||
my ($message, %allowed);
|
||||
@allowed{qw(from columns sort_by color link show_only section full_page show_inc show_dir files_in)} = ();
|
||||
|
||||
if (not exists $allowed{$key}) {
|
||||
$message = "$key is an invalid print_modules parameter";
|
||||
}
|
||||
elsif ($key eq 'sort_by' && $value !~ /^(?:name|version)$/i) {
|
||||
$message = "$value is an invalid sort";
|
||||
}
|
||||
elsif ($key =~ /^(?:color|link|columns|files_in)$/ && ref($value) ne 'ARRAY') {
|
||||
$message = "The $key parameter value is not an array reference";
|
||||
}
|
||||
elsif ($key eq 'columns' && grep(!/^(?:name|version|desc|path|core)$/, @{$value})) {
|
||||
$message = "Invalid column name in the $key parameter";
|
||||
}
|
||||
elsif ($key eq 'color' && @{$value} <= 1) {
|
||||
$message = "You didn't specify a module to color";
|
||||
}
|
||||
elsif ($key eq 'link' && @{$value} <= 1 && $value->[0] != 0) {
|
||||
$message = "You didn't provide a URL for the $key parameter";
|
||||
}
|
||||
elsif ($key eq 'show_only' && (ref($value) ne 'ARRAY') && lc $value ne 'core') {
|
||||
$message = "$value is an invalid value for the $key parameter";
|
||||
}
|
||||
elsif ($key eq 'full_page' && $value != 0 && $value != 1 ) {
|
||||
$message = "$value is an invalid value for the $key parameter";
|
||||
}
|
||||
elsif ($key eq 'link' && ($value->[0] ne 'all' && $value->[0] != 0 && ref($value->[0]) ne 'ARRAY')) {
|
||||
$message = "Invalid first element in the $key parameter value";
|
||||
}
|
||||
error_msg("$message") if $message;
|
||||
}
|
||||
|
||||
|
||||
|
||||
sub process_args {
|
||||
# This sub returns a hash ref containing param args
|
||||
my %params;
|
||||
my $sub = pop @_ || die "No coderef provided\n"; # get the sub
|
||||
if (defined $_[0]) {
|
||||
while(my($key, $value) = splice @_, 0, 2) {
|
||||
$sub->($key, $value);
|
||||
if (exists $params{$key}){
|
||||
@key_value = ref(${$params{$key}}[0]) eq 'ARRAY' ? @{$params{$key}} : $params{$key};
|
||||
push @key_value,$value;
|
||||
$new_val = [@key_value];
|
||||
$params{$key} = $new_val;
|
||||
}
|
||||
else {
|
||||
$params{$key} = $value;
|
||||
}
|
||||
}
|
||||
}
|
||||
return \%params;
|
||||
}
|
||||
|
||||
sub error_msg {
|
||||
local $Carp::CarpLevel = $Carp::CarpLevel + 1;
|
||||
Carp::croak "User error: $_[0]";
|
||||
}
|
||||
|
||||
# HTML subs
|
||||
|
||||
sub print_table_colspan_header {
|
||||
|
||||
return sprintf("<tr class=\"h\"><th colspan=\"%d\">%s</th></tr>\n", $_[0], $_[1]);
|
||||
|
||||
}
|
||||
|
||||
sub print_table_row {
|
||||
|
||||
|
||||
my $num_cols = $_[0];
|
||||
my $HTML = "<tr>";
|
||||
|
||||
for ($i=0; $i<$num_cols; $i++) {
|
||||
|
||||
$HTML .= sprintf("<td class=\"%s\">", ($i==0 ? "e" : "v" ));
|
||||
|
||||
my $row_element = $_[$i+1];
|
||||
if ((not defined ($row_element)) || ($row_element !~ /\S/)) {
|
||||
$HTML .= "<i>no value</i>";
|
||||
} else {
|
||||
my $elem_esc;
|
||||
if ($row_element eq "<i>no value</i>") {
|
||||
$elem_esc = $row_element;
|
||||
} else {
|
||||
$elem_esc = escapeHTML($row_element);
|
||||
}
|
||||
$HTML .= "$elem_esc";
|
||||
|
||||
}
|
||||
|
||||
$HTML .= " </td>";
|
||||
|
||||
}
|
||||
|
||||
$HTML .= "</tr>\n";
|
||||
return $HTML;
|
||||
|
||||
}
|
||||
|
||||
|
||||
sub print_table_color_start {
|
||||
|
||||
return qq~<table class="modules" cellpadding=4 cellspacing=4 border=0 width="600"><tr>\n~;
|
||||
}
|
||||
|
||||
sub print_table_color_end {
|
||||
|
||||
return '</tr></table>';
|
||||
}
|
||||
|
||||
|
||||
sub print_color_box {
|
||||
|
||||
return qq ~<td>
|
||||
<table border=0>
|
||||
<tr><td>
|
||||
<table class="modules" border=0 width=50 height=50 align=left bgcolor="$_[0]">
|
||||
<tr bgcolor="$_[0]">
|
||||
<td color="$_[0]">
|
||||
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
</tr></td>
|
||||
<tr><td><small>$_[1]</small></td></tr>
|
||||
</table>
|
||||
</td>~;
|
||||
}
|
||||
|
||||
sub print_table_row_color {
|
||||
|
||||
my $num_cols = $_[0];
|
||||
my $HTML = $_[1] ? "<tr bgcolor=\"$_[1]\">" : "<tr>";
|
||||
|
||||
for ($i=0; $i<$num_cols; $i++) {
|
||||
|
||||
$HTML .= $_[1] ? "<td bgcolor=\"$_[1]\">" : sprintf("<td class=\"%s\">", ($i==0 ? "e" : "v" ));
|
||||
|
||||
my $row_element = $_[$i+2]; # start at the 2nd element
|
||||
if ((not defined ($row_element)) || ($row_element !~ /\S/)) {
|
||||
$HTML .= "<i>no value</i>";
|
||||
} else {
|
||||
my $elem_esc = $row_element;
|
||||
$HTML .= "$elem_esc";
|
||||
|
||||
}
|
||||
|
||||
$HTML .= " </td>";
|
||||
|
||||
}
|
||||
|
||||
$HTML .= "</tr>\n";
|
||||
return $HTML;
|
||||
}
|
||||
|
||||
sub print_table_start {
|
||||
|
||||
return "<table border=\"0\" cellpadding=\"3\" width=\"600\">\n";
|
||||
|
||||
}
|
||||
sub print_table_end {
|
||||
|
||||
return "</table><br />\n";
|
||||
|
||||
}
|
||||
sub print_box_start {
|
||||
|
||||
my $HTML = print_table_start();
|
||||
$HTML .= ($_[0] == 1) ? "<tr class=\"h\"><td>\n" : "<tr class=\"v\"><td>\n";
|
||||
return $HTML;
|
||||
}
|
||||
|
||||
|
||||
sub print_box_end {
|
||||
my $HTML = "</td></tr>\n";
|
||||
$HTML .= print_table_end();
|
||||
return $HTML;
|
||||
}
|
||||
|
||||
sub print_hr {
|
||||
return "<hr />\n";
|
||||
|
||||
}
|
||||
|
||||
|
||||
sub print_table_header {
|
||||
|
||||
my($num_cols) = $_[0];
|
||||
my $HTML = "<tr class=\"h\">";
|
||||
|
||||
my $i;
|
||||
for ($i=0; $i<$num_cols; $i++) {
|
||||
my $row_element = $_[$i+1];
|
||||
$row_element = " " if (!$row_element);
|
||||
$HTML .= "<th>$row_element</th>";
|
||||
}
|
||||
|
||||
return "$HTML</tr>\n";
|
||||
}
|
||||
|
||||
|
||||
sub print_section {
|
||||
|
||||
return "<h2>" . $_[0] . "</h2>\n";
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
sub print_perl_license {
|
||||
|
||||
return <<"END_OF_HTML";
|
||||
<p>
|
||||
This program is free software; you can redistribute it and/or modify it under the terms of
|
||||
either the Artistic License or the GNU General Public License, which may be found in the Perl 5 source kit.
|
||||
</p>
|
||||
|
||||
<p>
|
||||
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
||||
</p>
|
||||
<p>
|
||||
Complete documentation for Perl, including FAQ lists, should be found on
|
||||
this system using `man perl' or `perldoc perl'. If you have access to the
|
||||
Internet, point your browser at @{[ add_link('same', 'http://www.perl.org/')]}, the Perl directory.
|
||||
</p>
|
||||
END_OF_HTML
|
||||
|
||||
}
|
||||
|
||||
sub print_license {
|
||||
|
||||
return join '', print_section("Perl License"),
|
||||
print_box_start(0),
|
||||
print_perl_license(),
|
||||
print_box_end();
|
||||
}
|
||||
|
||||
|
||||
sub add_link {
|
||||
|
||||
my ($type, $value, $link) = @_;
|
||||
return $value unless $links{'all'};
|
||||
|
||||
if ($type eq "cpan") {
|
||||
|
||||
return $value if $link && $link->[0] =~ /^[0]$/;
|
||||
|
||||
if ($link) {
|
||||
if (ref $link->[0] eq 'ARRAY' && ref $link->[1] ne 'ARRAY') {
|
||||
foreach (@{$link->[0]}) {
|
||||
if ($_ eq 'all' or match_string($value,$_)==1) {
|
||||
return '<a href=' . $link->[1] . $value .
|
||||
qq~ title="Click here to see $value documentation [Opens in a new window]"
|
||||
target="_blank">$value</a> ~
|
||||
}
|
||||
}
|
||||
}
|
||||
elsif (ref $link->[0] eq 'ARRAY' && ref $link->[1] eq 'ARRAY'){
|
||||
foreach my $lv (@$link) {
|
||||
if (ref $lv->[0] eq 'ARRAY') {
|
||||
foreach(@{$lv->[0]}) {
|
||||
if ($_ eq 'all' or match_string($value,$_)==1) {
|
||||
return '<a href=' . $lv->[1] . $value .
|
||||
qq~ title="Click here to see $value documentation [Opens in a new window]"
|
||||
target="_blank">$value</a> ~
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
if ($lv->[0] eq 'all' or match_string($value,$lv->[0])==1) {
|
||||
return '<a href=' . $lv->[1] . $value .
|
||||
qq~ title="Click here to see $value documentation [Opens in a new window]"
|
||||
target="_blank">$value</a> ~
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
elsif ($link->[0] eq 'all' or match_string($value,$link->[0])==1) {
|
||||
return '<a href=' . $link->[1] . $value .
|
||||
qq~ title="Click here to see $value documentation [Opens in a new window]"
|
||||
target="_blank">$value</a> ~
|
||||
}
|
||||
}
|
||||
return qq~ <a href="http://search.cpan.org/perldoc?$value"
|
||||
title="Click here to see $value on CPAN [Opens in a new window]" target="_blank">$value</a> ~;
|
||||
}
|
||||
elsif ($type eq "config") {
|
||||
return $value unless $links{'docs'};
|
||||
my ($letter) = $value =~ /^(.)/;
|
||||
return qq! <a href="http://search.cpan.org/~aburlison/Solaris-PerlGcc-1.3/config/5.006001/5.10/sparc/Config.pm#$letter">$value</a> !;
|
||||
}
|
||||
elsif ($type eq "local") {
|
||||
return $value unless $links{'local'};
|
||||
return qq~ <a href="file://$value" title="Click here to see $value [Opens in a new window]" target="_blank">$value</a> ~;
|
||||
}
|
||||
elsif ($type eq "same") {
|
||||
return qq~ <a href="$value">$value</a> ~;
|
||||
}
|
||||
}
|
||||
|
||||
1;
|
||||
227
database/perl/lib/HTML/Perlinfo/General.pm
Normal file
227
database/perl/lib/HTML/Perlinfo/General.pm
Normal file
@@ -0,0 +1,227 @@
|
||||
package HTML::Perlinfo::General;
|
||||
|
||||
use base qw(Exporter);
|
||||
our @EXPORT = qw(print_httpd print_thesemodules print_config print_variables print_general);
|
||||
use CGI qw(url_param param);
|
||||
use POSIX qw(uname);
|
||||
use Config qw(%Config config_sh);
|
||||
use Net::Domain qw(hostname);
|
||||
use File::Spec;
|
||||
use HTML::Perlinfo::Common;
|
||||
use HTML::Perlinfo::Apache;
|
||||
|
||||
# Should search for Get, Post, Cookies, Session, and Environment.
|
||||
sub perl_print_gpcse_array {
|
||||
my $html;
|
||||
my($name) = @_;
|
||||
my ($gpcse_name, $gpcse_value);
|
||||
#POST names should be param() and get in url_param()
|
||||
if (defined($ENV{'QUERY_STRING'}) && length($ENV{'QUERY_STRING'}) >= 1) {
|
||||
foreach(url_param()) { $html .= print_table_row(2, "GET[\"$_\"]", url_param($_)); }
|
||||
}
|
||||
if (defined($ENV{'CONTENT_LENGTH'})) {
|
||||
foreach(param()) { $html .= print_table_row(2, "POST[\"$_\"]", param($_)); }
|
||||
}
|
||||
if (defined($ENV{'HTTP_COOKIE'})) {
|
||||
$cookies = $ENV{'HTTP_COOKIE'};
|
||||
@cookies = split(';',$cookies);
|
||||
foreach (@cookies) {
|
||||
($k,$v) = split('=',$_);
|
||||
$html .= print_table_row(2, "COOKIE[\"$k\"]", "$v");
|
||||
}
|
||||
}
|
||||
foreach my $key (sort(keys %ENV))
|
||||
{
|
||||
$gpcse_name = "$name" . '["' . "$key" . '"]';
|
||||
if ($ENV{$key}) {
|
||||
$gpcse_value = "$ENV{$key}";
|
||||
} else {
|
||||
$gpcse_value = "<i>no value</i>";
|
||||
}
|
||||
|
||||
|
||||
$html .= print_table_row(2, "$gpcse_name", "$gpcse_value");
|
||||
}
|
||||
return $html;
|
||||
}
|
||||
|
||||
sub print_variables {
|
||||
|
||||
return join '', print_section("Environment"),
|
||||
print_table_start(),
|
||||
print_table_header(2, "Variable", "Value"),
|
||||
((defined($ENV{'SERVER_SOFTWARE'})) ? perl_print_gpcse_array("SERVER") : perl_print_gpcse_array("ENV",)),
|
||||
print_table_end();
|
||||
}
|
||||
|
||||
sub print_config {
|
||||
|
||||
return join '', print_section("Perl Core"),
|
||||
print_table_start(),
|
||||
print_table_header(2, "Variable", "Value"),
|
||||
print_config_sh($_[0]),
|
||||
print_table_end();
|
||||
}
|
||||
|
||||
sub print_config_sh {
|
||||
|
||||
my @configs = qw/api_versionstring cc ccflags cf_by cf_email cf_time extensions installarchlib installbin installhtmldir installhtmlhelpdir installprefix installprefixexp installprivlib installscript installsitearch installsitebin known_extensions libc libperl libpth libs myarchname optimize osname osvers package perllibs perlpath pm_apiversion prefix prefixexp privlib privlibexp startperl version version_patchlevel_string xs_apiversion/;
|
||||
|
||||
return ($_[0] eq 'info_all') ?
|
||||
join '', map { print_table_row(2, add_link('config',$_), $Config{$_})} @configs
|
||||
: join '', map { print_table_row(2, map{
|
||||
if ($_ !~ /^'/) {
|
||||
add_link('config', $_);
|
||||
}
|
||||
else {
|
||||
if ($_ eq "\'\'" || $_ eq "\' \'" ) {
|
||||
my $value = "<i>no value</i>";
|
||||
$value;
|
||||
}
|
||||
else {
|
||||
$_;
|
||||
}
|
||||
}
|
||||
}split/=/,$_ )
|
||||
} split /\n/, config_sh();
|
||||
}
|
||||
|
||||
sub print_httpd {
|
||||
|
||||
return unless (defined $ENV{'SERVER_SOFTWARE'} && $ENV{'SERVER_SOFTWARE'} =~ /apache/i);
|
||||
my $a = HTML::Perlinfo::Apache->new();
|
||||
|
||||
my $html .= print_section("Apache");
|
||||
$html .= print_table_start();
|
||||
$html .= $a->print_apache() if $a->has qw(env);
|
||||
$html .= print_table_end();
|
||||
|
||||
$html .= $a->print_modperl() if $a->has qw(mp);
|
||||
|
||||
$html .= print_section("Apache Environment"),
|
||||
$html .= print_table_start();
|
||||
$html .= print_table_header(2, "Variable", "Value"),
|
||||
$html .= $a->print_apache_environment() if $a->has qw(env);
|
||||
$html .= print_table_end();
|
||||
|
||||
return $html;
|
||||
}
|
||||
|
||||
sub print_thesemodules {
|
||||
my $opt = shift;
|
||||
$m = HTML::Perlinfo::Modules->new();
|
||||
if ($opt eq 'core') {
|
||||
return $m->print_modules(show_only=>$opt, full_page=>0);
|
||||
}
|
||||
elsif ($opt eq 'all') {
|
||||
return $m->print_modules(section=>'All Perl modules', full_page=>0);
|
||||
}
|
||||
elsif ($opt eq 'loaded' && ref $_[0] eq 'ARRAY') {
|
||||
return $m->print_modules(section=>'Loaded Modules', files_in=>shift, full_page=>0);
|
||||
}
|
||||
else {
|
||||
die "internal function print_thesemodules has invalid arguments: @_";
|
||||
}
|
||||
}
|
||||
|
||||
sub print_general {
|
||||
|
||||
my $opt = shift || 'full';
|
||||
|
||||
my $html = print_box_start(1);
|
||||
$html .= sprintf("<h1 class=\"p\">Perl Version %s</h1><br style=\"clear:both\" />Release date: %s", perl_version(), release_date());
|
||||
|
||||
$html .= print_box_end();
|
||||
|
||||
$html .= print_table_start();
|
||||
$html .= print_table_row(2, "Currently running on", "@{[ (uname)[0..4] ]}");
|
||||
$html .= print_table_row(2, "Built for", "$Config{archname}");
|
||||
$html .= print_table_row(2, "Build date", "$Config{cf_time}");
|
||||
$html .= print_table_row(2, "Perl path", "$^X");
|
||||
|
||||
$html .= print_table_row(2, "Additional C Compiler Flags", "$Config{ccflags}");
|
||||
$html .= print_table_row(2, "Optimizer/Debugger Flags", "$Config{optimize}");
|
||||
|
||||
if (defined($ENV{'SERVER_SOFTWARE'})) {
|
||||
$html .= print_table_row(2, "Server API", "$ENV{'SERVER_SOFTWARE'}");
|
||||
}
|
||||
if ($Config{usethreads} && !$Config{useithreads} && !$Config{use5005threads}) {
|
||||
$html .= print_table_row(2, "Thread Support", "enabled (threads)");
|
||||
}
|
||||
elsif ($Config{useithreads} && !$Config{usethreads} && !$Config{use5005threads}) {
|
||||
$html .= print_table_row(2, "Thread Support", "enabled (ithreads)");
|
||||
}
|
||||
elsif ($Config{use5005threads} && !$Config{usethreads} && !$Config{useithreads}) {
|
||||
$html .= print_table_row(2, "Thread Support", "enabled (5005threads)");
|
||||
}
|
||||
elsif ($Config{usethreads} && $Config{useithreads} && !$Config{use5005threads}) {
|
||||
$html .= print_table_row(2, "Thread Support", "enabled (threads, ithreads)");
|
||||
}
|
||||
elsif ($Config{usethreads} && $Config{use5005threads} && !$Config{useithreads}) {
|
||||
$html .= print_table_row(2, "Thread Support", "enabled (threads, 5005threads)");
|
||||
}
|
||||
elsif ($Config{useithreads} && $Config{use5005threads} && !$Config{usethreads}) {
|
||||
$html .= print_table_row(2, "Thread Support", "enabled (ithreads, 5005threads)");
|
||||
}
|
||||
elsif ($Config{usethreads} && $Config{useithreads} && $Config{use5005threads}) {
|
||||
$html .= print_table_row(2, "Thread Support", "enabled (threads, ithreads, 5005threads)");
|
||||
}
|
||||
else {
|
||||
$html .= print_table_row(2, "Thread Support", "disabled (threads, ithreads, 5005threads)");
|
||||
}
|
||||
$html .= print_table_end();
|
||||
|
||||
$html .= print_box_start(0);
|
||||
|
||||
|
||||
$html .= "This is perl, v$Config{version} built for $Config{archname}<br />Copyright (c) 1987-@{[ sprintf '%d', (localtime)[5]+1900]}, Larry Wall";
|
||||
$html .= print_box_end();
|
||||
|
||||
return $html if $opt eq 'top';
|
||||
|
||||
$html .= print_hr();
|
||||
$html .= "<h1>Configuration</h1>\n";
|
||||
$html .= print_config('info_all');
|
||||
|
||||
$html .= join '', print_section("Perl utilities"),
|
||||
print_table_start(),
|
||||
print_table_header(2, "Name", "Location"),
|
||||
print_table_row(2, add_link('cpan', 'h2ph'), check_path("h2ph")),
|
||||
print_table_row(2, add_link('cpan', 'h2xs'), check_path("h2xs")),
|
||||
print_table_row(2, add_link('cpan', 'perldoc'), check_path("perldoc")),
|
||||
print_table_row(2, add_link('cpan', 'pod2html'), check_path("pod2html")),
|
||||
print_table_row(2, add_link('cpan', 'pod2latex'), check_path("pod2latex")),
|
||||
print_table_row(2, add_link('cpan', 'pod2man'), check_path("pod2man")),
|
||||
print_table_row(2, add_link('cpan', 'pod2text'), check_path("pod2text")),
|
||||
print_table_row(2, add_link('cpan', 'pod2usage'), check_path("pod2usage")),
|
||||
print_table_row(2, add_link('cpan', 'podchecker'), check_path("podchecker")),
|
||||
print_table_row(2, add_link('cpan', 'podselect'), check_path("podselect")),
|
||||
print_table_end(),
|
||||
|
||||
print_section("Mail"),
|
||||
print_table_start(),
|
||||
print_table_row(2, 'SMTP', hostname()),
|
||||
print_table_row(2, 'sendmail_path', check_path("sendmail")),
|
||||
print_table_end(),
|
||||
|
||||
print_httpd(),
|
||||
|
||||
print_section("HTTP Headers Information"),
|
||||
print_table_start(),
|
||||
print_table_colspan_header(2, "HTTP Request Headers");
|
||||
if (defined($ENV{'SERVER_SOFTWARE'})) {
|
||||
$html .= join '',
|
||||
print_table_row(2, 'HTTP Request', "$ENV{'REQUEST_METHOD'} @{[File::Spec->abs2rel($0)]} $ENV{'SERVER_PROTOCOL'}"),
|
||||
print_table_row(2, 'Host', $ENV{'HTTP_HOST'}),
|
||||
print_table_row(2, 'User-Agent', $ENV{'HTTP_USER_AGENT'}),
|
||||
print_table_row(2, 'Accept', $ENV{'HTTP_ACCEPT_ENCODING'}),
|
||||
print_table_row(2, 'Accept-Language', $ENV{'HTTP_ACCEPT_LANGUAGE'}),
|
||||
print_table_row(2, 'Accept-Charset', $ENV{'HTTP_ACCEPT_CHARSET'}),
|
||||
print_table_row(2, 'Keep-Alive', $ENV{'HTTP_KEEP_ALIVE'}),
|
||||
print_table_row(2, 'Connection', $ENV{'HTTP_CONNECTION'});
|
||||
}
|
||||
$html .= print_table_end();
|
||||
return $html;
|
||||
}
|
||||
1;
|
||||
|
||||
147
database/perl/lib/HTML/Perlinfo/HTML.pod
Normal file
147
database/perl/lib/HTML/Perlinfo/HTML.pod
Normal file
@@ -0,0 +1,147 @@
|
||||
=pod
|
||||
|
||||
=head1 NAME
|
||||
|
||||
HTML::Perlinfo::HTML - HTML documentation for the perlinfo library
|
||||
|
||||
=head1 SUMMARY
|
||||
|
||||
HTML::Perlinfo validates as XHTML 1.0 Transitional.
|
||||
|
||||
In the perlinfo library, L<HTML::Perlinfo> and L<HTML::Perlinfo::Modules> use the internal module HTML::Perlinfo::Common for HTML generation. This document provides information on that HTML and its manipulation.
|
||||
|
||||
=head1 CUSTOMIZING THE HTML
|
||||
|
||||
You can capture the HTML output by assigning it to a scalar. Then you can alter the HTML before printing it or doing something else with it. Here is an example that uses the perlinfo function from L<HTML::Perlinfo>:
|
||||
|
||||
use HTML::Perlinfo;
|
||||
|
||||
my $example = perlinfo(); # Now I can do whatever I want with $example
|
||||
$example =~ s/Perl/Java/ig; # Make everyone laugh
|
||||
print $example;
|
||||
|
||||
Another option is to use object attributes which make altering some HTML elements less helter skelter.
|
||||
|
||||
=head1 OBJECT ATTRIBUTES
|
||||
|
||||
These object attributes allow you to change the HTML CSS settings to achieve a stylish effect. Please see your favorite HTML guide for acceptable CSS values. Refer to the HTML source code of the perlinfo page for the defaults.
|
||||
|
||||
Attribute name/Corresponding CSS element
|
||||
|
||||
title / page title (only non-CSS element)
|
||||
bg_image / background_image
|
||||
bg_position / background_position
|
||||
bg_repeat / background_repeat
|
||||
bg_attribute / background_attribute
|
||||
bg_color / background_color
|
||||
ft_family / font_familty
|
||||
ft_color / font_color
|
||||
lk_color / link color
|
||||
lk_decoration / link text-decoration
|
||||
lk_bgcolor / link background-color
|
||||
lk_hvdecoration / link hover text-decoration
|
||||
header_bgcolor / table header background-color
|
||||
header_ftcolor / table header font color
|
||||
leftcol_bgcolor / background-color of leftmost table cell
|
||||
leftcol_ftcolor / font color of left table cell
|
||||
rightcol_bgcolor / background-color of right table cell
|
||||
rightcol_ftcolor / font color of right table cell
|
||||
|
||||
=head2 CSS EXAMPLE
|
||||
|
||||
$p = HTML::Perlinfo->new(
|
||||
bg_image => 'http://i104.photobucket.com/albums/m176/perlinfo/camel.gif',
|
||||
bg_repeat => 'yes-repeat'
|
||||
);
|
||||
$p->info_all;
|
||||
|
||||
=head1 print_htmlhead
|
||||
|
||||
This method prints the head container tags containing the style sheet, along with a few other html tags. It is useful to call this method when full_page is set to 0 and you are piecing together multiple perlinfo pages into one page. For example:
|
||||
|
||||
$m = HTML::Perlinfo::Modules->new( full_page => 0 ); # Just the bare essentials please
|
||||
|
||||
$m->print_htmlhead; # Print the beginning of an html document
|
||||
|
||||
$m->print_modules( from =>'/home/paco',
|
||||
section => 'The Modules in Paco's Home Directory'
|
||||
);
|
||||
|
||||
$m->print_modules( from =>'/home/cowboy',
|
||||
section => 'The Modules in Cowboy's Home Directory'
|
||||
);
|
||||
|
||||
When full_page is set to 1 (the default value), print_htmlhead is called internally. Note that you can still set CSS values in the constructor even when full_page is set to 0 and see the results in print_htmlhead.
|
||||
|
||||
$m = HTML::Perlinfo::Modules->new(
|
||||
full_page => 0,
|
||||
bg_color => 'gray'
|
||||
);
|
||||
|
||||
$m->print_htmlhead; # Prints a HTML document with a gray background
|
||||
|
||||
Of course, you don't have to use the print_htmlhead method. You could insert your own HTML with your own style sheet when you set full_page to 0.
|
||||
|
||||
=head1 full_page
|
||||
|
||||
Do you want only a fragment of HTML and not a page with body tags (among other things)? Then the full_page option is what you need to use (or a regular expression, as explained above). This option allows you to add your own header/footer if you so desire. By default, the value is 1. Set it to 0 to output the HTML report with as little HTML as possible.
|
||||
|
||||
$p = HTML::Perlinfo->new( full_page => 0 ); # Change value to 1 to get a full HTML page
|
||||
|
||||
=head1 links
|
||||
|
||||
By default, there will be useful links in most of the presented HTML in the perlinfo library. These links are for pages on search.cpan.org. Even the info_config method lists links to the config options in the core Config module.
|
||||
|
||||
To manipulate links in the perlinfo library, you can use the links attribute in the info methods. Not to be confused with the "link" attribute in the HTML::Perlinfo::Module (which allows you to provide your own links for modules), this attribute's primary purpose is to turn on linking or turn it off. Of course, you can achieve the same effect by using regular expressions, as explained above. But using the links attribute makes your code cleaner.
|
||||
|
||||
There are several arguments (in an array reference) you can supply to the links attribute.
|
||||
|
||||
The number 1 turns on all default links and 0 will remove them.
|
||||
|
||||
For example, to remove the default links in the info_all method, you would say:
|
||||
|
||||
$p->info_all( links=>[0] ); # contains no links. Good for printing!
|
||||
|
||||
The example above removes all default links and it even ignores the L<link|HTML::Perlinfo::Modules#link> parameter in the print_modules method of L<HTML::Perlinfo::Modules>.
|
||||
|
||||
The named parameters for the links attribute are 'docs' and 'local' which controls links associated with modules, programs in the Perl utilities section, and the Config section and everywhere else. The value for either parameter can be either '1' or '0'.
|
||||
|
||||
=over
|
||||
|
||||
=item docs
|
||||
|
||||
Using 'docs', you can control the display of the default links to module and program documentation on search.cpan.org. But the link parameter in L<HTML::Perlinfo::Modules> can override this directive. By overridding 'docs=>0', you can show documentation for certain modules and not show documentation for any others. This is useful, for example, when you have homegrown modules without any documentation but want to show links to documentation for CPAN modules on the same page. Observe:
|
||||
|
||||
$p->print_modules( links => [docs=>0], link => [qr/Apache::/, 'http://www.myexample.com/perldoc/'] );
|
||||
|
||||
In the above example, only links to Apache modules would appear. Other modules would not have links to any documentation. Note that had you simply set the value for links to zero, then the other attribute concerning Apache modules would have been irrelevant, since no links whatsoever would have appeared. In other words, you can mix and match these two atttibutes to achieve many different and wonderous effects. Have fun! Be imaginative!
|
||||
|
||||
For more information on print_modules and its link parameter, please see L<HTML::Perlinfo::Modules>.
|
||||
|
||||
=item local
|
||||
|
||||
With the 'local' parameter set to 1, the local location of a module or program will be a link. This is useful if you want to see the local installation directory of a module in your browser. (From there, you could also look at the contents of said files.)
|
||||
|
||||
Note that this link would only work if you use the perlinfo library from the command-line and then view the resulting page on the same machine. Hence these local links are not present by default.
|
||||
|
||||
You can even use 'docs' along with 'local'.
|
||||
|
||||
$p->info_all( links => [docs=>0,local=>1] )
|
||||
|
||||
=back
|
||||
|
||||
=head1 NOTES
|
||||
|
||||
L<HTML::Perlinfo::Modules> allows you to color code specific modules.
|
||||
|
||||
More HTML options should be available in future revisions. Want to see a new feature/change? Then contact me about it.
|
||||
|
||||
=head1 AUTHOR
|
||||
|
||||
Mike Accardo <mikeaccardo@yahoo.com>
|
||||
|
||||
=head1 COPYRIGHT
|
||||
|
||||
Copyright (c) 2009, Mike Accardo. All Rights Reserved.
|
||||
|
||||
=cut
|
||||
66
database/perl/lib/HTML/Perlinfo/Loaded.pm
Normal file
66
database/perl/lib/HTML/Perlinfo/Loaded.pm
Normal file
@@ -0,0 +1,66 @@
|
||||
package HTML::Perlinfo::Loaded;
|
||||
BEGIN { %Seen = %INC }
|
||||
|
||||
use HTML::Perlinfo::Modules;
|
||||
|
||||
$VERSION = '1.02';
|
||||
|
||||
%INC = %Seen;
|
||||
END {
|
||||
|
||||
delete $INC{'HTML/Perlinfo/Loaded.pm'};
|
||||
my $m = HTML::Perlinfo::Modules->new(full_page=>0, title=>'perlinfo(INFO_LOADED)');
|
||||
$m->print_htmlhead;
|
||||
$m->print_modules('files_in'=>[values %INC],'section'=>'Loaded Modules');
|
||||
print $m->info_variables,"</div></body></html>";
|
||||
}
|
||||
|
||||
1;
|
||||
__END__
|
||||
=pod
|
||||
|
||||
=head1 NAME
|
||||
|
||||
HTML::Perlinfo::Loaded - Post-execution HTML dump of loaded modules and environment variables
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
#!/usr/bin/perl
|
||||
use HTML::Perlinfo::Loaded;
|
||||
...
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
This module installs an at-exit handler to generate an HTML dump of all the module files used by a Perl program. As an added bonus, environment variables are also included in this dump. When used under mod_perl, the module will show you preloaded modules in the HTML page too.
|
||||
|
||||
Since the "dump" is a complete HTML page, this module is a good debugging tool for Web applications. Just make sure you print the content-type header beforehand or you will get an internal server error (malformed header).
|
||||
|
||||
Note that the HTML::Perlinfo function 'perlinfo' has an option called INFO_LOADED that will produce the same result. In other words, there is more than one way to do it! Observe:
|
||||
|
||||
use HTML::Perlinfo;
|
||||
|
||||
perlinfo(INFO_LOADED);
|
||||
|
||||
The result will be the same if you say:
|
||||
|
||||
#!/usr/bin/perl
|
||||
use HTML::Perlinfo::Loaded;
|
||||
...
|
||||
|
||||
There is no difference, except using the perlinfo option gives you greater control. You could always control HTML::Perlinfo::Loaded with a pound sign (a comment on/off), but if you are using mod_perl it makes more sense to add HTML::Perlinfo to your startup file and then call perlinfo(INFO_LOADED) when you want to dump.
|
||||
|
||||
=head1 SEE ALSO
|
||||
|
||||
L<Devel::Loaded>, L<HTML::Perlinfo::Modules>, L<HTML::Perlinfo>, L<perlinfo>
|
||||
|
||||
=head1 AUTHOR
|
||||
|
||||
Mike Accardo <mikeaccardo@yahoo.com>
|
||||
|
||||
=head1 COPYRIGHT
|
||||
|
||||
Copyright (c) 2008, Mike Accardo. All Rights Reserved.
|
||||
This module is free software. It may be used, redistributed
|
||||
and/or modified under the terms of the Perl Artistic License.
|
||||
|
||||
=cut
|
||||
753
database/perl/lib/HTML/Perlinfo/Modules.pm
Normal file
753
database/perl/lib/HTML/Perlinfo/Modules.pm
Normal file
@@ -0,0 +1,753 @@
|
||||
package HTML::Perlinfo::Modules;
|
||||
|
||||
use strict;
|
||||
use File::Find;
|
||||
use File::Spec;
|
||||
use Carp ();
|
||||
use Config qw(%Config);
|
||||
use base qw(HTML::Perlinfo::Base);
|
||||
use CGI qw(escapeHTML);
|
||||
use HTML::Perlinfo::Common;
|
||||
|
||||
our $VERSION = '1.17';
|
||||
|
||||
|
||||
sub new {
|
||||
|
||||
my ($class, %params) = @_;
|
||||
$params{'title'} = exists $params{'title'} ? $params{'title'} : 'Perl Modules';
|
||||
|
||||
$class->SUPER::new(%params);
|
||||
|
||||
}
|
||||
|
||||
|
||||
sub module_color_check {
|
||||
|
||||
my ($module_name, $color_specs) = @_;
|
||||
if (defined $color_specs && ref($color_specs->[0]) eq 'ARRAY') {
|
||||
foreach (@{ $color_specs }) {
|
||||
return $_->[0] if (match_string($module_name,$_->[1])==1);
|
||||
}
|
||||
}
|
||||
else {
|
||||
return $color_specs->[0] if (defined $color_specs && match_string($module_name,$color_specs->[1])==1);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
# get_modinfo
|
||||
# This sub was created for the files_in option.
|
||||
# Returns found_mod reference
|
||||
######################################
|
||||
|
||||
sub get_files_in {
|
||||
|
||||
my ($file_path) = @_;
|
||||
|
||||
return 0 unless $file_path =~ m/\.pm$/;
|
||||
my $mod_info = module_info($file_path, undef);
|
||||
return $mod_info;
|
||||
|
||||
}
|
||||
|
||||
|
||||
sub sort_modules {
|
||||
|
||||
my ($modules, $sort_by) = @_;
|
||||
my @sorted_modules;
|
||||
|
||||
if ($sort_by eq 'name') {
|
||||
foreach my $key (sort {lc $a cmp lc $b} keys %$modules) {
|
||||
# Check for duplicate modules
|
||||
if (ref($modules->{$key}) eq 'ARRAY') {
|
||||
foreach (@{ $modules->{$key} }) {
|
||||
push @sorted_modules, $_;
|
||||
}
|
||||
}
|
||||
else {
|
||||
push @sorted_modules, $modules->{$key};
|
||||
}
|
||||
}
|
||||
}
|
||||
elsif ($sort_by eq 'version') {
|
||||
foreach my $key (keys %$modules) {
|
||||
if (ref($modules->{$key}) eq 'ARRAY') {
|
||||
@{ $modules->{$key} } = sort {$a->{'version'} cmp $b->{'version'}}@{ $modules->{$key} };
|
||||
for (@{ $modules->{$key}}) {
|
||||
push @sorted_modules, $_;
|
||||
}
|
||||
}
|
||||
else {
|
||||
push @sorted_modules, $modules->{$key};
|
||||
}
|
||||
}
|
||||
@sorted_modules = sort {$a->{'version'} cmp $b->{'version'}}@sorted_modules;
|
||||
}
|
||||
return @sorted_modules;
|
||||
}
|
||||
|
||||
sub html_setup {
|
||||
|
||||
my ($self, $columns, $color_specs, $section, $full_page) = @_;
|
||||
|
||||
my $html;
|
||||
|
||||
$html .= $self->print_htmlhead if $full_page;
|
||||
|
||||
my %show_columns = (
|
||||
'name' => 'Module name',
|
||||
'version' => 'Version',
|
||||
'path' => 'Location',
|
||||
'core' => 'Core',
|
||||
'desc' => 'Description'
|
||||
);
|
||||
|
||||
$html .= $section ? print_section($section) : '';
|
||||
$html .= print_color_codes($color_specs) if $color_specs && $color_specs->[2];
|
||||
$html .= print_table_start();
|
||||
$html .= print_table_header(scalar @$columns, map{ $show_columns{$_} }@$columns);
|
||||
return $html;
|
||||
}
|
||||
|
||||
sub module_info {
|
||||
my ($module_path, $show_only) = @_;
|
||||
|
||||
( $module_path ) = $module_path =~ /^(.*)$/;
|
||||
|
||||
my ($mod_name, $mod_version, $mod_desc);
|
||||
|
||||
no warnings 'all'; # silence warnings
|
||||
open(MOD, $module_path) or return 0;
|
||||
while (<MOD>) {
|
||||
|
||||
unless ($mod_name) {
|
||||
if (/^ *package +(\S+);/) {
|
||||
$mod_name = $1;
|
||||
}
|
||||
}
|
||||
|
||||
unless ($mod_version) {
|
||||
|
||||
if (/([\$*])(([\w\:\']*)\bVERSION)\b.*\=/) {
|
||||
|
||||
my $line = substr $_, index($_, $1);
|
||||
my $eval = qq{
|
||||
package HTML::Perlinfo::_version;
|
||||
no strict;
|
||||
|
||||
local $1$2;
|
||||
\$$2=undef; do {
|
||||
$line
|
||||
}; \$$2
|
||||
};
|
||||
|
||||
( $eval ) = $eval =~ /^(.*)$/sm;
|
||||
$mod_version = eval($eval);
|
||||
# Again let us be nice here.
|
||||
$mod_version = '<i>unknown</i>' if (not defined $mod_version) || ($@);
|
||||
$mod_version =~ s/^\s+|\s+$//;
|
||||
}
|
||||
}
|
||||
|
||||
unless ($mod_desc) {
|
||||
if (/=head\d\s+NAME/) {
|
||||
local $/ = '';
|
||||
local $_;
|
||||
chomp($_ = <MOD>);
|
||||
($mod_desc) = /^.*?-+\s*(.*?)$/ism;
|
||||
}
|
||||
}
|
||||
|
||||
last if $mod_name && $mod_version && $mod_desc;
|
||||
|
||||
}
|
||||
|
||||
close (MOD);
|
||||
return 0 if (! $mod_name || $show_only && ref $show_only && (match_string($mod_name, $show_only) == 0));
|
||||
$mod_version = '<i>unknown</i>' if !($mod_version) || ($mod_version !~ /^[\.\d+_]+$/);
|
||||
$mod_desc = escapeHTML($mod_desc) if $mod_desc;
|
||||
$mod_desc = "<i>No description found</i>" unless $mod_desc;
|
||||
return { 'name' => $mod_name, 'version' => $mod_version, 'desc' => $mod_desc };
|
||||
}
|
||||
|
||||
sub print_color_codes {
|
||||
my $color_specs = shift;
|
||||
my ($html, $label);
|
||||
$html .= print_table_start();
|
||||
$html .= print_table_header(1, "Module Color Codes");
|
||||
$html .= print_table_color_start();
|
||||
|
||||
if (ref($color_specs->[0]) eq 'ARRAY') {
|
||||
my $count = 0;
|
||||
foreach (@{ $color_specs }) {
|
||||
$html .= "<tr>" if $count++ % 5 == 0;
|
||||
$label = $_->[2] || $_->[1];
|
||||
$html .= print_color_box($_->[0], $label);
|
||||
$html .= "</tr>" if (($count >= 5 && $count % 5 == 0)||($count >= @{$color_specs}));
|
||||
}
|
||||
}
|
||||
else {
|
||||
$label = $color_specs->[2] || $color_specs->[1];
|
||||
$html .= print_color_box($color_specs->[0], $label);
|
||||
}
|
||||
|
||||
$html .= print_table_color_end();
|
||||
$html .= print_table_end();
|
||||
return $html;
|
||||
}
|
||||
|
||||
sub print_module_results {
|
||||
|
||||
my ($mod_dir, $mod_count, $from, $overall_total, $show_dir) = @_;
|
||||
|
||||
my ($html, $total_amount, $searched, @mod_dir, @bad_dir, %seen);
|
||||
|
||||
if ($show_dir) {
|
||||
|
||||
$html .= print_table_start();
|
||||
$html .= print_table_header(2, "Directory", "Number of Modules");
|
||||
for my $dir (keys %{$mod_count}) {
|
||||
my $amount_found = $mod_count->{$dir};
|
||||
push (@mod_dir, $dir) if $amount_found;
|
||||
}
|
||||
|
||||
for my $dir1 (@mod_dir) {
|
||||
for my $dir2 (@mod_dir) {
|
||||
if ($dir1 ne $dir2 && $dir2 =~ /^$dir1/) {
|
||||
push @bad_dir, $dir2;
|
||||
}
|
||||
}
|
||||
}
|
||||
for my $top_dir (@mod_dir) {
|
||||
unless (grep{$_ eq $top_dir }@bad_dir) {
|
||||
$html .= print_table_row(2, add_link('local', File::Spec->canonpath($top_dir)), $mod_count->{$top_dir});
|
||||
}
|
||||
}
|
||||
$html .= print_table_end();
|
||||
}
|
||||
else {
|
||||
# Print out directories not in @INC
|
||||
@mod_dir = grep { -d $_ && -r $_ && !$seen{$_}++ } map {File::Spec->canonpath($_)}@INC;
|
||||
my @module_paths = grep { not exists $seen{$_} }@$mod_dir;
|
||||
|
||||
if (@module_paths >= 1) {
|
||||
$html .= print_table_start();
|
||||
$html .= print_table_header(3, "Directory", "Searched", "Number of Modules");
|
||||
|
||||
for my $dir (map{ File::Spec->canonpath($_) }@module_paths) {
|
||||
$searched = (grep { $_ eq $dir } @$mod_dir) ? "yes" : "no";
|
||||
my $amount_found = ($searched eq 'yes') ? $mod_count->{$dir} : '<i>unknown</i>';
|
||||
$html .= print_table_row(3, add_link('local', File::Spec->canonpath($dir)), $searched, $amount_found);
|
||||
}
|
||||
$html .= print_table_end();
|
||||
}
|
||||
|
||||
|
||||
$html .= print_table_start();
|
||||
$html .= print_table_header(3, "Include path (INC) directories", "Searched", "Number of Modules");
|
||||
for my $dir (@mod_dir) {
|
||||
$searched = exists $mod_count->{$dir} ? 'yes' : 'no';
|
||||
my $amount_found = ($searched eq 'yes') ? $mod_count->{$dir} : '<i>unknown</i>';
|
||||
$html .= print_table_row(3, add_link('local', File::Spec->canonpath($dir)), $searched, $amount_found);
|
||||
}
|
||||
|
||||
$html .= print_table_end();
|
||||
}
|
||||
|
||||
$html .= print_table_start();
|
||||
#my $view = ($from eq 'all') ? 'installed' :
|
||||
# ($from eq 'core') ? 'core' : 'found';
|
||||
|
||||
$html .= print_table_row(2, "Total modules", $overall_total);
|
||||
$html .= print_table_end();
|
||||
|
||||
return $html;
|
||||
|
||||
}
|
||||
|
||||
sub search_dir {
|
||||
|
||||
my ($from, $show_only, $core_dir1, $core_dir2) = @_;
|
||||
|
||||
|
||||
my %seen = ();
|
||||
|
||||
my @user_dir = (ref($from) eq 'ARRAY') && $show_only ne 'core' ? @{$from} :
|
||||
($show_only eq 'core') ? ($core_dir1, $core_dir2) : $from;
|
||||
|
||||
# Make sure only unique entries and readable directories in @mod_dir
|
||||
my @mod_dir = grep { -d $_ && -r $_ && !$seen{$_}++ } map {File::Spec->canonpath($_)}@user_dir;
|
||||
if (@mod_dir != @user_dir) {
|
||||
|
||||
# Looks like there might have been a problem with the directories given to us.
|
||||
# Or maybe not. @user_dir could have duplicate values and that's ok.
|
||||
# But let's still warn about any unreadable or non-directories given
|
||||
|
||||
my @debug;
|
||||
%seen = ();
|
||||
@user_dir = grep { !$seen{$_}++ } map {File::Spec->canonpath($_)}@user_dir;
|
||||
if (@user_dir > @mod_dir) {
|
||||
#%seen = map {$_ => undef} @mod_dir;
|
||||
%seen = ();
|
||||
@seen{@mod_dir} = ();
|
||||
my @difference = grep { !$seen{$_}++ }@user_dir;
|
||||
foreach my $element (@difference) {
|
||||
if (! -d $element) {
|
||||
if ( grep {$_ eq $element} map {File::Spec->canonpath($_)}@INC) {
|
||||
warn "$element is in the Perl include path, but is not a directory";
|
||||
}
|
||||
else {
|
||||
warn "$element is not a directory";
|
||||
}
|
||||
push @debug, $element;
|
||||
}
|
||||
elsif (! -r $element) {
|
||||
if ( grep {$_ eq $element} map {File::Spec->canonpath($_)}@INC) {
|
||||
warn "$element is in the Perl include path, but is not readable";
|
||||
}
|
||||
else {
|
||||
warn "$element is not a readable directory";
|
||||
}
|
||||
|
||||
push @debug, $element;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
error_msg("Search directories are invalid") unless @mod_dir >= 1;
|
||||
|
||||
return @mod_dir;
|
||||
}
|
||||
|
||||
sub get_input {
|
||||
|
||||
my $self = shift;
|
||||
my $args = process_args(@_, \&check_module_args);
|
||||
my %input = ();
|
||||
$input{'files_in'} = $args->{'files_in'} || undef;
|
||||
$input{'sort_by'} = $args->{'sort_by'} || 'name';
|
||||
$input{'from'} = $args->{'from'} || \@INC;
|
||||
$input{'show_only'} = $args->{'show_only'} || "";
|
||||
$input{'color_specs'} = $args->{'color'};
|
||||
$input{'link'} = $args->{'link'};
|
||||
$input{'section'} = exists $args->{'section'} ? $args->{'section'} :
|
||||
$input{'show_only'} eq 'core' ? 'Core Perl modules installed' : '';
|
||||
$input{'full_page'} = exists $args->{'full_page'} ? $args->{'full_page'} : $self->{'full_page'};
|
||||
$input{'show_inc'} = exists $args->{'show_inc'} ? $args->{'show_inc'} : 1;
|
||||
$input{'show_dir'} = exists $args->{'show_dir'} ? $args->{'show_dir'} : 0;
|
||||
$input{'columns'} = exists $args->{'columns'} ? $args->{'columns'} : ['name','version','desc'];
|
||||
return %input;
|
||||
}
|
||||
|
||||
sub print_modules {
|
||||
|
||||
my %input = get_input(@_);
|
||||
|
||||
my ($found_mod, $mod_count, $overall_total, @mod_dir, $core_dir1, $core_dir2);
|
||||
|
||||
# Check to see if a search is even needed
|
||||
if (defined $input{'files_in'}) {
|
||||
|
||||
my @files = @{ $input{'files_in'} };
|
||||
my %found_mod = ();
|
||||
|
||||
foreach my $file_path (@files) {
|
||||
|
||||
my $mod_info = get_files_in($file_path);
|
||||
next unless (ref $mod_info eq 'HASH');
|
||||
$found_mod{$mod_info->{'name'}} = $mod_info;
|
||||
}
|
||||
return undef unless (keys %found_mod > 0);
|
||||
$found_mod = \%found_mod;
|
||||
}
|
||||
else {
|
||||
|
||||
# Get ready to search
|
||||
$core_dir1 = File::Spec->canonpath($Config{installarchlib});
|
||||
$core_dir2 = File::Spec->canonpath($Config{installprivlib});
|
||||
|
||||
@mod_dir = search_dir($input{'from'}, $input{'show_only'}, $core_dir1, $core_dir2);
|
||||
|
||||
($overall_total, $found_mod, $mod_count) = find_modules($input{'show_only'}, \@mod_dir);
|
||||
|
||||
return undef unless $overall_total;
|
||||
|
||||
}
|
||||
|
||||
my @sorted_modules = sort_modules($found_mod, $input{'sort_by'});
|
||||
|
||||
my $html .= html_setup( $_[0],
|
||||
$input{'columns'},
|
||||
$input{'color_specs'},
|
||||
$input{'section'},
|
||||
$input{'full_page'}
|
||||
);
|
||||
|
||||
my $numberof_columns = scalar @{$input{'columns'}};
|
||||
|
||||
foreach my $module (@sorted_modules) {
|
||||
|
||||
$html .= print_table_row_color( $numberof_columns,
|
||||
module_color_check($module->{'name'}, $input{'color_specs'}),
|
||||
map{
|
||||
if ($_ eq 'name') {
|
||||
add_link('cpan', $module->{'name'}, $input{'link'});
|
||||
}
|
||||
elsif ($_ eq 'core') {
|
||||
(grep File::Spec->rel2abs($module->{'path'}) =~ /\Q$_/, ($core_dir1, $core_dir2)) ? 'yes' : 'no';
|
||||
}
|
||||
elsif ($_ eq 'path') {
|
||||
add_link('local', $module->{'path'});
|
||||
}
|
||||
else {
|
||||
$module->{$_};
|
||||
}
|
||||
|
||||
} @{$input{'columns'}} );
|
||||
}
|
||||
|
||||
$html .= print_table_end();
|
||||
|
||||
unless (defined $input{'files_in'} && ref $input{'files_in'} eq 'ARRAY') {
|
||||
$html .= print_module_results( \@mod_dir,
|
||||
$mod_count,
|
||||
$input{'from'},
|
||||
$overall_total,
|
||||
$input{'show_dir'}) if $input{'show_inc'};
|
||||
}
|
||||
|
||||
$html .= "</div></body></html>" if $input{'full_page'};
|
||||
|
||||
defined wantarray ? return $html : print $html;
|
||||
|
||||
}
|
||||
|
||||
sub find_modules {
|
||||
|
||||
my ($show_only, $mod_dir) = @_;
|
||||
|
||||
my ($overall_total, $module, $base, $start_dir, $new_val, $mod_info);
|
||||
# arrays
|
||||
my (@modinfo_array, @mod_dir);
|
||||
# hashes
|
||||
my ( %path, %inc_path, %mod_count, %found_mod);
|
||||
@mod_dir = @$mod_dir;
|
||||
|
||||
@path{@mod_dir} = ();
|
||||
@inc_path{@INC} = ();
|
||||
for $base (@mod_dir) {
|
||||
|
||||
find ({ wanted => sub {
|
||||
for (@INC, @mod_dir) {
|
||||
if (index($File::Find::name, $_) == 0) {
|
||||
# lets record it unless we already have hit the dir
|
||||
$mod_count{$_} = 0 unless exists $mod_count{$_};
|
||||
}
|
||||
}
|
||||
# This prevents mod_dir dirs from being searched again when you have a dir within a dir
|
||||
$File::Find::prune = 1, return if exists $path{$File::Find::name} && $File::Find::name ne $File::Find::topdir;
|
||||
|
||||
# make sure we are dealing with a module
|
||||
return unless $File::Find::name =~ m/\.pm$/;
|
||||
$mod_info = module_info($File::Find::name, $show_only);
|
||||
return unless ref ($mod_info) eq 'HASH';
|
||||
|
||||
# update the counts.
|
||||
for (@INC, grep{not exists $inc_path{$_}}@mod_dir) {
|
||||
if (index($File::Find::name, $_) == 0) {
|
||||
$mod_count{$_}++;
|
||||
}
|
||||
}
|
||||
$overall_total++;
|
||||
|
||||
$mod_info->{'path'} = File::Spec->canonpath($File::Find::dir);
|
||||
# Check for duplicate modules
|
||||
if (exists $found_mod{$mod_info->{'name'}}) {
|
||||
@modinfo_array = ref( $found_mod{$mod_info->{'name'}} ) eq 'ARRAY' ? @{$found_mod{$mod_info->{'name'}}} : $found_mod{$mod_info->{'name'}};
|
||||
push @modinfo_array, $mod_info;
|
||||
$new_val = [@modinfo_array];
|
||||
$found_mod{$mod_info->{'name'}} = $new_val;
|
||||
}
|
||||
else {
|
||||
$found_mod{$mod_info->{'name'}} = $mod_info;
|
||||
}
|
||||
|
||||
},untaint => 1, untaint_pattern => qr|^([-+@\s\S\w./]+)$|}, $base);
|
||||
} # end of for loop
|
||||
|
||||
return ($overall_total, \%found_mod, \%mod_count);
|
||||
|
||||
}
|
||||
|
||||
1;
|
||||
__END__
|
||||
=pod
|
||||
|
||||
=head1 NAME
|
||||
|
||||
HTML::Perlinfo::Modules - Display a lot of module information in HTML format
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
use HTML::Perlinfo::Modules;
|
||||
|
||||
my $m = HTML::Perlinfo::Modules->new();
|
||||
$m->print_modules;
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
This module outputs information about your Perl modules in HTML. The information includes a module's name, version, description and location. The HTML presents the module information in B<two sections>, one section is a list of modules and the other is a summary of this list. Both the list and its summary are configurable.
|
||||
|
||||
Other information displayed:
|
||||
|
||||
- Duplicate modules. So if you have CGI.pm installed in different locations, these duplicate modules will be shown.
|
||||
|
||||
- Automatic links to module documentation on CPAN (you can also provide your own URLs).
|
||||
|
||||
- The number of modules under each directory.
|
||||
|
||||
You can chose to show 'core' modules or you can search for specific modules. You can also define search paths. HTML::Perlinfo::Modules searches the Perl include path (from @INC) by default. You can also highlight specific modules with different colors.
|
||||
|
||||
=head1 METHODS
|
||||
|
||||
=head2 print_modules
|
||||
|
||||
This is the key method in this module. It accepts optional named parameters that dictate the display of module information. The method returns undefined if no modules were found. This means that you can write code such as:
|
||||
|
||||
my $modules = $m->print_modules(from=>'/home/paco');
|
||||
|
||||
if ($modules) {
|
||||
print $modules;
|
||||
}
|
||||
else {
|
||||
print "No modules are in Paco's home directory!";
|
||||
}
|
||||
|
||||
The code example above will show you the modules in Paco's home directory if any are found. If none are found, the code prints the message in the else block. There is a lot more you can do with the named parameters, but you do not have to use them. For example:
|
||||
|
||||
$m->print_modules;
|
||||
|
||||
# The above line is the equivalent of saying:
|
||||
$m->print_modules(
|
||||
from => \@INC,
|
||||
columns => ['name','version','desc'],
|
||||
sort_by => 'name',
|
||||
show_inc => 1
|
||||
);
|
||||
|
||||
# Alternatively, in this case, you could use a HTML::Perlinfo method to achieve the same result.
|
||||
# Note that HTML::Perlinfo::Modules inherits all of the HTML::Perlinfo methods
|
||||
|
||||
$m->info_modules;
|
||||
|
||||
The optional named parameters for the print_modules method are listed below.
|
||||
|
||||
=head3 from
|
||||
|
||||
Show modules from specific directories.
|
||||
|
||||
This parameter accepts 2 things: a single directory or an array reference (containing directories).
|
||||
|
||||
The default value is the Perl include path. This is equivalent of supplying \@INC as a value. If you want to show all of the modules on your box, you can specify '/' as a value (or the disk drive on Windows).
|
||||
|
||||
=head3 files_in
|
||||
|
||||
If you don't need to search for your files and you already have the B<complete pathnames> to them, then you can use the 'files_in' option which accepts an array reference containing the files you wish to display. One obvious use for this option would be in displaying the contents of the INC hash, which holds the modules used by your Perl module or script:
|
||||
|
||||
$m->print_modules('files_in'=>[values %INC]);
|
||||
|
||||
This is the same technique used by the L<HTML::Perlinfo::Loaded> module which performs a post-execution HTML dump of your loaded modules. See L<HTML::Perlinfo::Loaded> for details.
|
||||
|
||||
=head3 columns
|
||||
|
||||
This parameter allows you to control the table columns in the list of modules. With this parameter, you can dictate which columns will be shown and their order. Examples:
|
||||
|
||||
# Show only module names
|
||||
columns=>['name']
|
||||
|
||||
# Show version numbers before names
|
||||
columns=>['version','name']
|
||||
|
||||
# Default columns are:
|
||||
columns=>['name','version','desc']
|
||||
|
||||
The column parameter accepts an array reference containing strings that represent the column names. Those names are:
|
||||
|
||||
=over
|
||||
|
||||
=item name
|
||||
|
||||
The module name. This value is the namespace in the package declaration. Note that the method for retrieving the module name is not fool proof, since a module file can have multiple package declarations. HTML::Perlinfo::Modules grabs the namespace from the first package declaration that it finds.
|
||||
|
||||
=item version
|
||||
|
||||
The version number. Divines the value of $VERSION.
|
||||
|
||||
=item desc
|
||||
|
||||
The module description. The description is from the POD. Note that some modules don't have POD (or have POD without a description) and, in such cases, the message "No description found" will be shown.
|
||||
|
||||
=item path
|
||||
|
||||
The full path to the module file on disk. Printing out the path is especially useful when you want to learn the locations of duplicate modules.
|
||||
|
||||
B<Note that you can make this path a link.> This is useful if you want to see the local installation directory of a module in your browser. (From there, you could also look at the contents of the files.) Be aware that this link would only work if you use this module from the command-line and then view the resulting page on the same machine. Hence these local links are not present by default. To learn more about local links, please refer to the L<HTML documentation|HTML::Perlinfo::HTML>.
|
||||
|
||||
=item core
|
||||
|
||||
This column value (either 'yes' or 'no') will tell you if the module is core. In other words, it will tell you if the module was included in your Perl distribution. If the value is 'yes', then the module lives in either the installarchlib or the installprivlib directory listed in the config file.
|
||||
|
||||
=back
|
||||
|
||||
=head3 sort_by
|
||||
|
||||
You use this parameter to sort the modules. Values can be either 'version' for version number sorting (in descending order) or 'name' for alphabetical sorting (the default).
|
||||
|
||||
=head3 show_only
|
||||
|
||||
This parameter acts like a filter and only shows you the modules (more specifically, the package names) you request. So if, for example, you wanted to only show modules in the Net namspace, you would use the show_only parameter. It is probably the most useful option available for the print_modules method. With this option, you can use HTML::Perlinfo::Modules as a search engine tool for your local Perl modules. Observe:
|
||||
|
||||
$m->print_modules(
|
||||
show_only => ['MYCOMPANY::'],
|
||||
section => 'My Company's Custom Perl Modules',
|
||||
show_dir => 1
|
||||
);
|
||||
|
||||
The example above will print out every module in the 'MYCOMPANY' namespace in the Perl include path (@INC). The list will be entitled 'My Company's Custom Perl Modules' and because show_dir is set to 1, the list will only show the directories in which these modules were found along with how many are present in each directory.
|
||||
|
||||
You can add namespaces to the array reference:
|
||||
|
||||
$m->print_modules(
|
||||
show_only => ['MYCOMPANY::', 'Apache::'],
|
||||
section => 'My Company's Custom Perl Modules & Apache Modules',
|
||||
show_dir => 1
|
||||
);
|
||||
|
||||
In addition to an array reference, show_only also accepts the word 'core', a value that will show you all of the core Perl modules (in the installarchlib and installprivlib directories from the config file).
|
||||
|
||||
=head3 show_inc
|
||||
|
||||
Whenever you perform a module search, you will see a summary of your search that includes the directories searched and the number of modules found. Whether or not your search encompasses the Perl include path (@INC), you will still see these directories, along with any other directories that were actually searched. If you do not what to see this search summary, you must set show_inc to 0. The default value is 1.
|
||||
|
||||
=head3 show_dir
|
||||
|
||||
The default value is 0. Setting this parameter to 1 will only show you the directories in which your modules were found (along with a summary of how many were found, etc). If you do not want to show a search summary, then you must use the show_inc parameter.
|
||||
|
||||
=head3 color
|
||||
|
||||
This parameter allows you to highlight modules with different colors. Highlighting specific modules is a good way to draw attention to them.
|
||||
|
||||
The parameter value must be an array reference containing at least 2 elements. The first element is the color itself which can be either a hex code like #FFD700 or the name of the color. The second element specifies the module(s) to color. And the third, optional element, in the array reference acts as a label in the color code section. This final element can even be a link if you so desire.
|
||||
|
||||
Examples:
|
||||
|
||||
color => ['red', 'Apache::'],
|
||||
color => ['#FFD700', 'CGI::']
|
||||
|
||||
Alternatively, you can also change the color of the rows, by setting CSS values in the constructor. For example:
|
||||
|
||||
$m = HTML::Perlinfo::Modules->new(
|
||||
leftcol_bgcolor => 'red',
|
||||
rightcol_bgcolor => 'red'
|
||||
);
|
||||
|
||||
$m->print_modules(
|
||||
show_only => 'CGI::',
|
||||
show_inc => 0
|
||||
);
|
||||
|
||||
# This next example does the same thing, but uses the color parameter in the print_modules method
|
||||
|
||||
$m = HTML::Perlinfo::Modules->new();
|
||||
|
||||
$m->print_modules(
|
||||
show_only => ['CGI::'],
|
||||
color => ['red', 'CGI::'],
|
||||
show_inc => 0
|
||||
);
|
||||
|
||||
The above example will yield the same HTML results. So which approach should you use? The CSS approach gives you greater control of the HTML presentation. The color parameter, on the other hand, only affects the row colors in the modules list. You cannot achieve that same effect using CSS. For example:
|
||||
|
||||
$m->print_modules( color => ['red', 'CGI::'], color => ['red', 'Apache::'] );
|
||||
|
||||
The above example will list B<all of the modules> in @INC with CGI modules colored red and Apache modules colored blue.
|
||||
|
||||
For further information on customizing the HTML, including setting CSS values, please refer to the L<HTML documentation|HTML::Perlinfo::HTML>.
|
||||
|
||||
=head3 section
|
||||
|
||||
The section parameter lets you put a heading above the module list. Example:
|
||||
|
||||
$m->print_modules(
|
||||
show_only => ['Apache::'],
|
||||
section => 'Apache/mod_perl modules',
|
||||
show_dir => 1,
|
||||
);
|
||||
|
||||
=head3 full_page
|
||||
|
||||
Do you want only a fragment of HTML and not a page with body tags (among other things)? Then the full_page option is what you need to use (or a regular expression, as explained in the L<HTML documentation|HTML::Perlinfo::HTML>). This option allows you to add your own header/footer if you so desire. By default, the value is 1. Set it to 0 to output the HTML report with as little HTML as possible.
|
||||
|
||||
$m = HTML::Perlinfo::Modules->new( full_page => 0 );
|
||||
# You will still get an HTML page but without CSS settings or body tags
|
||||
$m->print_modules;
|
||||
|
||||
$m->print_modules( full_page => 1 ); # Now you will get the complete, default HTML page.
|
||||
|
||||
Note that the full_page option can be set in either the constructor or the method call. The advantage of setting it in the constructor is that every subsequent method call will have this attribute. (There is no limit to how many times you can call print_modules in a program. If calling the method more than once makes no sense to you, then you need to look at the show_only and from options.) If you set the full_page in the print_modules method, you will override its value in the object.
|
||||
|
||||
=head3 link
|
||||
|
||||
By default, every module is linked to its documentation on search.cpan.org. However some modules, such as custom modules, would not be in CPAN and their link would not show any documentation. With the 'link' parameter you can override the CPAN link with you own URL.
|
||||
|
||||
The parameter value must be an array reference containing two elements. The first element can either be a string specifying the module(s) to link or an array reference containing strings or the word 'all' which will link all the modules in the list. The second element is the root URL. In the link, the module name will come after the URL. So in the example below, the link for the Apache::Status module would be 'http://www.myexample.com/perldoc/Apache::Status'.
|
||||
|
||||
link => ['Apache::', 'http://www.myexample.com/perldoc/']
|
||||
|
||||
# Another example
|
||||
my $module = HTML::Perlinfo::Modules
|
||||
->new
|
||||
->print_modules( show_only => ['CGI::','File::','HTML::'],
|
||||
link => ['HTML::', 'http://www.html-example.com/perldoc/'],
|
||||
link => [['CGI::','File::'], 'http://www.examples.com/perldoc/'] );
|
||||
|
||||
|
||||
Further information about linking is in the L<HTML documentation|HTML::Perlinfo::HTML>.
|
||||
|
||||
=head1 CUSTOMIZING THE HTML
|
||||
|
||||
HTML::Perlinfo::Modules uses the same HTML generation as its parent module, HTML::Perlinfo.
|
||||
|
||||
You can capture the HTML output and manipulate it or you can alter CSS elements with object attributes.
|
||||
|
||||
(Note that you can also highlight certain modules with the color parameter to print_modules.)
|
||||
|
||||
For further details and examples, please see the L<HTML documentation|HTML::Perlinfo::HTML> in the HTML::Perlinfo distribution.
|
||||
|
||||
=head1 BUGS
|
||||
|
||||
Please report any bugs or feature requests to C<bug-html-perlinfo@rt.cpan.org>, or through the web interface at
|
||||
L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=HTML-Perlinfo>.
|
||||
I will be notified, and then you'll automatically be notified of progress on your bug as I make changes.
|
||||
|
||||
=head1 NOTES
|
||||
|
||||
If you decide to use this module in a CGI script, make sure you print out the content-type header beforehand.
|
||||
|
||||
=head1 SEE ALSO
|
||||
|
||||
L<HTML::Perlinfo::Loaded>, L<HTML::Perlinfo>, L<perlinfo>, L<Module::Info>, L<Module::CoreList>.
|
||||
|
||||
=head1 AUTHOR
|
||||
|
||||
Mike Accardo <mikeaccardo@yahoo.com>
|
||||
|
||||
=head1 COPYRIGHT
|
||||
|
||||
Copyright (c) 2006-8, Mike Accardo. All Rights Reserved.
|
||||
This module is free software. It may be used, redistributed
|
||||
and/or modified under the terms of the Perl Artistic License.
|
||||
|
||||
=cut
|
||||
209
database/perl/lib/HTML/PullParser.pm
Normal file
209
database/perl/lib/HTML/PullParser.pm
Normal file
@@ -0,0 +1,209 @@
|
||||
package HTML::PullParser;
|
||||
|
||||
require HTML::Parser;
|
||||
@ISA=qw(HTML::Parser);
|
||||
$VERSION = "3.57";
|
||||
|
||||
use strict;
|
||||
use Carp ();
|
||||
|
||||
sub new
|
||||
{
|
||||
my($class, %cnf) = @_;
|
||||
|
||||
# Construct argspecs for the various events
|
||||
my %argspec;
|
||||
for (qw(start end text declaration comment process default)) {
|
||||
my $tmp = delete $cnf{$_};
|
||||
next unless defined $tmp;
|
||||
$argspec{$_} = $tmp;
|
||||
}
|
||||
Carp::croak("Info not collected for any events")
|
||||
unless %argspec;
|
||||
|
||||
my $file = delete $cnf{file};
|
||||
my $doc = delete $cnf{doc};
|
||||
Carp::croak("Can't parse from both 'doc' and 'file' at the same time")
|
||||
if defined($file) && defined($doc);
|
||||
Carp::croak("No 'doc' or 'file' given to parse from")
|
||||
unless defined($file) || defined($doc);
|
||||
|
||||
# Create object
|
||||
$cnf{api_version} = 3;
|
||||
my $self = $class->SUPER::new(%cnf);
|
||||
|
||||
my $accum = $self->{pullparser_accum} = [];
|
||||
while (my($event, $argspec) = each %argspec) {
|
||||
$self->SUPER::handler($event => $accum, $argspec);
|
||||
}
|
||||
|
||||
if (defined $doc) {
|
||||
$self->{pullparser_str_ref} = ref($doc) ? $doc : \$doc;
|
||||
$self->{pullparser_str_pos} = 0;
|
||||
}
|
||||
else {
|
||||
if (!ref($file) && ref(\$file) ne "GLOB") {
|
||||
require IO::File;
|
||||
$file = IO::File->new($file, "r") || return;
|
||||
}
|
||||
|
||||
$self->{pullparser_file} = $file;
|
||||
}
|
||||
$self;
|
||||
}
|
||||
|
||||
|
||||
sub handler
|
||||
{
|
||||
Carp::croak("Can't set handlers for HTML::PullParser");
|
||||
}
|
||||
|
||||
|
||||
sub get_token
|
||||
{
|
||||
my $self = shift;
|
||||
while (!@{$self->{pullparser_accum}} && !$self->{pullparser_eof}) {
|
||||
if (my $f = $self->{pullparser_file}) {
|
||||
# must try to parse more from the file
|
||||
my $buf;
|
||||
if (read($f, $buf, 512)) {
|
||||
$self->parse($buf);
|
||||
} else {
|
||||
$self->eof;
|
||||
$self->{pullparser_eof}++;
|
||||
delete $self->{pullparser_file};
|
||||
}
|
||||
}
|
||||
elsif (my $sref = $self->{pullparser_str_ref}) {
|
||||
# must try to parse more from the scalar
|
||||
my $pos = $self->{pullparser_str_pos};
|
||||
my $chunk = substr($$sref, $pos, 512);
|
||||
$self->parse($chunk);
|
||||
$pos += length($chunk);
|
||||
if ($pos < length($$sref)) {
|
||||
$self->{pullparser_str_pos} = $pos;
|
||||
}
|
||||
else {
|
||||
$self->eof;
|
||||
$self->{pullparser_eof}++;
|
||||
delete $self->{pullparser_str_ref};
|
||||
delete $self->{pullparser_str_pos};
|
||||
}
|
||||
}
|
||||
else {
|
||||
die;
|
||||
}
|
||||
}
|
||||
shift @{$self->{pullparser_accum}};
|
||||
}
|
||||
|
||||
|
||||
sub unget_token
|
||||
{
|
||||
my $self = shift;
|
||||
unshift @{$self->{pullparser_accum}}, @_;
|
||||
$self;
|
||||
}
|
||||
|
||||
1;
|
||||
|
||||
|
||||
__END__
|
||||
|
||||
=head1 NAME
|
||||
|
||||
HTML::PullParser - Alternative HTML::Parser interface
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
use HTML::PullParser;
|
||||
|
||||
$p = HTML::PullParser->new(file => "index.html",
|
||||
start => 'event, tagname, @attr',
|
||||
end => 'event, tagname',
|
||||
ignore_elements => [qw(script style)],
|
||||
) || die "Can't open: $!";
|
||||
while (my $token = $p->get_token) {
|
||||
#...do something with $token
|
||||
}
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
The HTML::PullParser is an alternative interface to the HTML::Parser class.
|
||||
It basically turns the HTML::Parser inside out. You associate a file
|
||||
(or any IO::Handle object or string) with the parser at construction time and
|
||||
then repeatedly call $parser->get_token to obtain the tags and text
|
||||
found in the parsed document.
|
||||
|
||||
The following methods are provided:
|
||||
|
||||
=over 4
|
||||
|
||||
=item $p = HTML::PullParser->new( file => $file, %options )
|
||||
|
||||
=item $p = HTML::PullParser->new( doc => \$doc, %options )
|
||||
|
||||
A C<HTML::PullParser> can be made to parse from either a file or a
|
||||
literal document based on whether the C<file> or C<doc> option is
|
||||
passed to the parser's constructor.
|
||||
|
||||
The C<file> passed in can either be a file name or a file handle
|
||||
object. If a file name is passed, and it can't be opened for reading,
|
||||
then the constructor will return an undefined value and $! will tell
|
||||
you why it failed. Otherwise the argument is taken to be some object
|
||||
that the C<HTML::PullParser> can read() from when it needs more data.
|
||||
The stream will be read() until EOF, but not closed.
|
||||
|
||||
A C<doc> can be passed plain or as a reference
|
||||
to a scalar. If a reference is passed then the value of this scalar
|
||||
should not be changed before all tokens have been extracted.
|
||||
|
||||
Next the information to be returned for the different token types must
|
||||
be set up. This is done by simply associating an argspec (as defined
|
||||
in L<HTML::Parser>) with the events you have an interest in. For
|
||||
instance, if you want C<start> tokens to be reported as the string
|
||||
C<'S'> followed by the tagname and the attributes you might pass an
|
||||
C<start>-option like this:
|
||||
|
||||
$p = HTML::PullParser->new(
|
||||
doc => $document_to_parse,
|
||||
start => '"S", tagname, @attr',
|
||||
end => '"E", tagname',
|
||||
);
|
||||
|
||||
At last other C<HTML::Parser> options, like C<ignore_tags>, and
|
||||
C<unbroken_text>, can be passed in. Note that you should not use the
|
||||
I<event>_h options to set up parser handlers. That would confuse the
|
||||
inner logic of C<HTML::PullParser>.
|
||||
|
||||
=item $token = $p->get_token
|
||||
|
||||
This method will return the next I<token> found in the HTML document,
|
||||
or C<undef> at the end of the document. The token is returned as an
|
||||
array reference. The content of this array match the argspec set up
|
||||
during C<HTML::PullParser> construction.
|
||||
|
||||
=item $p->unget_token( @tokens )
|
||||
|
||||
If you find out you have read too many tokens you can push them back,
|
||||
so that they are returned again the next time $p->get_token is called.
|
||||
|
||||
=back
|
||||
|
||||
=head1 EXAMPLES
|
||||
|
||||
The 'eg/hform' script shows how we might parse the form section of
|
||||
HTML::Documents using HTML::PullParser.
|
||||
|
||||
=head1 SEE ALSO
|
||||
|
||||
L<HTML::Parser>, L<HTML::TokeParser>
|
||||
|
||||
=head1 COPYRIGHT
|
||||
|
||||
Copyright 1998-2001 Gisle Aas.
|
||||
|
||||
This library is free software; you can redistribute it and/or
|
||||
modify it under the same terms as Perl itself.
|
||||
|
||||
=cut
|
||||
453
database/perl/lib/HTML/SimpleParse.pm
Normal file
453
database/perl/lib/HTML/SimpleParse.pm
Normal file
@@ -0,0 +1,453 @@
|
||||
package HTML::SimpleParse;
|
||||
|
||||
use strict;
|
||||
use vars qw($VERSION $FIX_CASE);
|
||||
|
||||
$VERSION = '0.12';
|
||||
my $debug = 0;
|
||||
|
||||
sub new {
|
||||
my $pack = shift;
|
||||
|
||||
my $self = bless {
|
||||
'text' => shift(),
|
||||
'tree' => [],
|
||||
@_
|
||||
}, $pack;
|
||||
|
||||
$self->parse() if defined $self->{'text'} and length $self->{'text'};
|
||||
return $self;
|
||||
}
|
||||
|
||||
sub text {
|
||||
my $self = shift;
|
||||
$self->{'text'} = shift if @_;
|
||||
return $self->{'text'};
|
||||
}
|
||||
|
||||
sub tree { @{$_[0]->{'tree'}} }
|
||||
|
||||
sub parse {
|
||||
# Much of this is a dumbed-down version of HTML::Parser::parse.
|
||||
|
||||
my $self = shift;
|
||||
my $text = \ $self->{'text'};
|
||||
my $tree = $self->{'tree'};
|
||||
|
||||
# Parse html text in $$text. The strategy is to remove complete
|
||||
# tokens from the beginning of $$text until we can't decide whether
|
||||
# it is a token or not, or the $$text is empty.
|
||||
|
||||
@$tree = ();
|
||||
while (1) {
|
||||
my ($content, $type);
|
||||
|
||||
# First we try to pull off any plain text (anything before a "<" char)
|
||||
if ($$text =~ /\G([^<]+)/gcs) {
|
||||
$content = $1; $type = 'text';
|
||||
|
||||
# Then, SSI, comments, and markup declarations (usually <!DOCTYPE...>)
|
||||
# ssi: <!--#stuff-->
|
||||
# comment: <!--stuff-->
|
||||
# markup: <!stuff>
|
||||
} elsif ($$text =~ /\G<(!--(\#?).*?--)>/gcs) {
|
||||
$type = ($2 ? 'ssi' : 'comment');
|
||||
$content = $1;
|
||||
|
||||
} elsif ($$text =~ /\G<(!.*?)>/gcs) {
|
||||
$type = 'markup';
|
||||
$content = $1;
|
||||
|
||||
# Then, look for an end tag
|
||||
} elsif ($$text =~ m|\G<(/[a-zA-Z][a-zA-Z0-9\.\-]*\s*)>|gcs) {
|
||||
$content = $1; $type = 'endtag';
|
||||
|
||||
# Then, finally we look for a start tag
|
||||
# We know the first char is <, make sure there's a >
|
||||
} elsif ($$text =~ /\G<(.*?)>/gcs) {
|
||||
$content = $1; $type = 'starttag';
|
||||
|
||||
} else {
|
||||
# the string is exhausted, or there's no > in it.
|
||||
push @$tree, {
|
||||
'content' => substr($$text, pos $$text),
|
||||
'type' => 'text',
|
||||
} unless pos($$text) eq length($$text);
|
||||
last;
|
||||
}
|
||||
|
||||
push @$tree, {
|
||||
'content' => $content,
|
||||
'type' => $type,
|
||||
'offset' => ($type eq 'text' ?
|
||||
pos($$text) - length($content) :
|
||||
pos($$text) - length($content) - 2),
|
||||
};
|
||||
}
|
||||
|
||||
$self;
|
||||
}
|
||||
|
||||
|
||||
$FIX_CASE = 1;
|
||||
sub parse_args {
|
||||
my $self = shift; # Not needed here
|
||||
my $str = shift;
|
||||
my $fix_case = ((ref $self and exists $self->{fix_case}) ? $self->{fix_case} : $FIX_CASE);
|
||||
my @returns;
|
||||
|
||||
# Make sure we start searching at the beginning of the string
|
||||
pos($str) = 0;
|
||||
|
||||
while (1) {
|
||||
next if $str =~ m/\G\s+/gc; # Get rid of leading whitespace
|
||||
|
||||
if ( $str =~ m/\G
|
||||
([\w.-]+)\s*=\s* # the key
|
||||
(?:
|
||||
"([^\"\\]* (?: \\.[^\"\\]* )* )"\s* # quoted string, with possible whitespace inside,
|
||||
| # or
|
||||
'([^\'\\]* (?: \\.[^\'\\]* )* )'\s* # quoted string, with possible whitespace inside,
|
||||
| # or
|
||||
([^\s>]*)\s* # anything else, without whitespace or >
|
||||
)/gcx ) {
|
||||
|
||||
my ($key, $val) = ($1, $+);
|
||||
$val =~ s/\\(.)/$1/gs;
|
||||
push @returns, ($fix_case==1 ? uc($key) : $fix_case==-1 ? lc($key) : $key), $val;
|
||||
|
||||
} elsif ( $str =~ m,\G/?([\w.-]+)\s*,gc ) {
|
||||
push @returns, ($fix_case==1 ? uc($1) : $fix_case==-1 ? lc($1) : $1 ), undef;
|
||||
} else {
|
||||
last;
|
||||
}
|
||||
}
|
||||
|
||||
return @returns;
|
||||
}
|
||||
|
||||
|
||||
sub execute {
|
||||
my $self = shift;
|
||||
my $ref = shift;
|
||||
my $method = "output_$ref->{type}";
|
||||
warn "calling $self->$method(...)" if $debug;
|
||||
return $self->$method($ref->{content});
|
||||
}
|
||||
|
||||
sub get_output {
|
||||
my $self = shift;
|
||||
my ($method, $out) = ('', '');
|
||||
foreach ($self->tree) {
|
||||
$out .= $self->execute($_);
|
||||
}
|
||||
return $out;
|
||||
}
|
||||
|
||||
|
||||
sub output {
|
||||
my $self = shift;
|
||||
my $method;
|
||||
foreach ($self->tree) {
|
||||
print $self->execute($_);
|
||||
}
|
||||
}
|
||||
|
||||
sub output_text { $_[1]; }
|
||||
sub output_comment { "<$_[1]>"; }
|
||||
sub output_endtag { "<$_[1]>"; }
|
||||
sub output_starttag { "<$_[1]>"; }
|
||||
sub output_markup { "<$_[1]>"; }
|
||||
sub output_ssi { "<$_[1]>"; }
|
||||
|
||||
1;
|
||||
__END__
|
||||
|
||||
=head1 NAME
|
||||
|
||||
HTML::SimpleParse - a bare-bones HTML parser
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
use HTML::SimpleParse;
|
||||
|
||||
# Parse the text into a simple tree
|
||||
my $p = new HTML::SimpleParse( $html_text );
|
||||
$p->output; # Output the HTML verbatim
|
||||
|
||||
$p->text( $new_text ); # Give it some new HTML to chew on
|
||||
$p->parse # Parse the new HTML
|
||||
$p->output;
|
||||
|
||||
my %attrs = HTML::SimpleParse->parse_args('A="xx" B=3');
|
||||
# %attrs is now ('A' => 'xx', 'B' => '3')
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
This module is a simple HTML parser. It is similar in concept to HTML::Parser,
|
||||
but it differs from HTML::TreeBuilder in a couple of important ways.
|
||||
|
||||
First, HTML::TreeBuilder knows which tags can contain other tags, which
|
||||
start tags have corresponding end tags, which tags can exist only in
|
||||
the <HEAD> portion of the document, and so forth. HTML::SimpleParse
|
||||
does not know any of these things. It just finds tags and text in the
|
||||
HTML you give it, it does not care about the specific content of these
|
||||
tags (though it does distiguish between different _types_ of tags,
|
||||
such as comments, starting tags like <b>, ending tags like </b>, and
|
||||
so on).
|
||||
|
||||
Second, HTML::SimpleParse does not create a hierarchical tree of HTML content,
|
||||
but rather a simple linear list. It does not pay any attention to balancing
|
||||
start tags with corresponding end tags, or which pairs of tags are inside other
|
||||
pairs of tags.
|
||||
|
||||
Because of these characteristics, you can make a very effective HTML
|
||||
filter by sub-classing HTML::SimpleParse. For example, to remove all comments
|
||||
from HTML:
|
||||
|
||||
package NoComment;
|
||||
use HTML::SimpleParse;
|
||||
@ISA = qw(HTML::SimpleParse);
|
||||
sub output_comment {}
|
||||
|
||||
package main;
|
||||
NoComment->new($some_html)->output;
|
||||
|
||||
Historically, I started the HTML::SimpleParse project in part because
|
||||
of a misunderstanding about HTML::Parser's functionality. Many
|
||||
aspects of these two modules actually overlap. I continue to maintain
|
||||
the HTML::SimpleParse module because people seem to be depending on
|
||||
it, and because beginners sometimes find HTML::SimpleParse to be
|
||||
simpler than HTML::Parser's more powerful interface. People also seem
|
||||
to get a fair amount of usage out of the C<parse_args()> method
|
||||
directly.
|
||||
|
||||
=head2 Methods
|
||||
|
||||
=over 4
|
||||
|
||||
=item * new
|
||||
|
||||
$p = new HTML::SimpleParse( $some_html );
|
||||
|
||||
Creates a new HTML::SimpleParse object. Optionally takes one argument,
|
||||
a string containing some HTML with which to initialize the object. If
|
||||
you give it a non-empty string, the HTML will be parsed into a tree and
|
||||
ready for outputting.
|
||||
|
||||
Can also take a list of attributes, such as
|
||||
|
||||
$p = new HTML::SimpleParse( $some_html, 'fix_case' => -1);
|
||||
|
||||
See the C<parse_args()> method below for an explanation of this attribute.
|
||||
|
||||
=item * text
|
||||
|
||||
$text = $p->text;
|
||||
$p->text( $new_text );
|
||||
|
||||
Get or set the contents of the HTML to be parsed.
|
||||
|
||||
=item * tree
|
||||
|
||||
foreach ($p->tree) { ... }
|
||||
|
||||
Returns a list of all the nodes in the tree, in case you want to step
|
||||
through them manually or something. Each node in the tree is an
|
||||
anonymous hash with (at least) three data members, $node->{type} (is
|
||||
this a comment, a start tag, an end tag, etc.), $node->{content} (all
|
||||
the text between the angle brackets, verbatim), and $node->{offset}
|
||||
(number of bytes from the beginning of the string).
|
||||
|
||||
The possible values of $node->{type} are C<text>, C<starttag>,
|
||||
C<endtag>, C<ssi>, and C<markup>.
|
||||
|
||||
=item * parse
|
||||
|
||||
$p->parse;
|
||||
|
||||
Once an object has been initialized with some text, call $p->parse and
|
||||
a tree will be created. After the tree is created, you can call $p->output.
|
||||
If you feed some text to the new() method, parse will be called automatically
|
||||
during your object's construction.
|
||||
|
||||
=item * parse_args
|
||||
|
||||
%hash = $p->parse_args( $arg_string );
|
||||
|
||||
This routine is handy for parsing the contents of an HTML tag into key=value
|
||||
pairs. For instance:
|
||||
|
||||
$text = 'type=checkbox checked name=flavor value="chocolate or strawberry"';
|
||||
%hash = $p->parse_args( $text );
|
||||
# %hash is ( TYPE=>'checkbox', CHECKED=>undef, NAME=>'flavor',
|
||||
# VALUE=>'chocolate or strawberry' )
|
||||
|
||||
Note that the position of the last m//g search on the string (the value
|
||||
returned by Perl's pos() function) will be altered by the parse_args function,
|
||||
so make sure you take that into account if (in the above example) you do
|
||||
C<$text =~ m/something/g>.
|
||||
|
||||
The parse_args() method can be run as either an object method or as a
|
||||
class method, i.e. as either $p->parse_args(...) or
|
||||
HTML::SimpleParse->parse_args(...).
|
||||
|
||||
HTML attribute lists are supposed to be case-insensitive with respect
|
||||
to attribute names. To achieve this behavior, parse_args() respects
|
||||
the 'fix_case' flag, which can be set either as a package global
|
||||
$FIX_CASE, or as a class member datum 'fix_case'. If set to 0, no
|
||||
case conversion is done. If set to 1, all keys are converted to upper
|
||||
case. If set to -1, all keys are converted to lower case. The
|
||||
default is 1, i.e. all keys are uppercased.
|
||||
|
||||
If an attribute takes no value (like "checked" in the above example) then it
|
||||
will still have an entry in the returned hash, but its value will be C<undef>.
|
||||
For example:
|
||||
|
||||
%hash = $p->parse_args('type=checkbox checked name=banana value=""');
|
||||
# $hash{CHECKED} is undef, but $hash{VALUE} is ""
|
||||
|
||||
This method actually returns a list (not a hash), so duplicate attributes and
|
||||
order will be preserved if you want them to be:
|
||||
|
||||
@hash = $p->parse_args("name=family value=gwen value=mom value=pop");
|
||||
# @hash is qw(NAME family VALUE gwen VALUE mom VALUE pop)
|
||||
|
||||
=item * output
|
||||
|
||||
$p->output;
|
||||
|
||||
This will output the contents of the HTML, passing the real work off to
|
||||
the output_text, output_comment, etc. functions. If you do not override any
|
||||
of these methods, this module will output the exact text that it parsed into
|
||||
a tree in the first place.
|
||||
|
||||
=item * get_output
|
||||
|
||||
print $p->get_output
|
||||
|
||||
Similar to $p->output(), but returns its result instead of printing it.
|
||||
|
||||
=item * execute
|
||||
|
||||
foreach ($p->tree) {
|
||||
print $p->execute($_);
|
||||
}
|
||||
|
||||
Executes a single node in the HTML parse tree. Useful if you want to loop
|
||||
through the nodes and output them individually.
|
||||
|
||||
=back
|
||||
|
||||
The following methods do the actual outputting of the various parts of
|
||||
the HTML. Override some of them if you want to change the way the HTML
|
||||
is output. For instance, to strip comments from the HTML, override the
|
||||
output_comment method like so:
|
||||
|
||||
# In subclass:
|
||||
sub output_comment { } # Does nothing
|
||||
|
||||
=over 4
|
||||
|
||||
=item * output_text
|
||||
|
||||
=item * output_comment
|
||||
|
||||
=item * output_endtag
|
||||
|
||||
=item * output_starttag
|
||||
|
||||
=item * output_markup
|
||||
|
||||
=item * output_ssi
|
||||
|
||||
|
||||
=back
|
||||
|
||||
=head1 CAVEATS
|
||||
|
||||
Please do not assume that the interface here is stable. This is a first pass,
|
||||
and I'm still trying to incorporate suggestions from the community. If you
|
||||
employ this module somewhere, make doubly sure before upgrading that none of your
|
||||
code breaks when you use the newer version.
|
||||
|
||||
|
||||
=head1 BUGS
|
||||
|
||||
=over 4
|
||||
|
||||
=item * Embedded >s are broken
|
||||
|
||||
Won't handle tags with embedded >s in them, like
|
||||
<input name=expr value="x > y">. This will be fixed in a future
|
||||
version, probably by using the parse_args method. Suggestions are welcome.
|
||||
|
||||
=back
|
||||
|
||||
=head1 TO DO
|
||||
|
||||
=over 4
|
||||
|
||||
=item * extensibility
|
||||
|
||||
Based on a suggestion from Randy Harmon (thanks), I'd like to make it easier
|
||||
for subclasses of SimpleParse to pick out other kinds of HTML blocks, i.e.
|
||||
extend the set {text, comment, endtag, starttag, markup, ssi} to include more
|
||||
members. Currently the only easy way to do that is by overriding the
|
||||
C<parse> method:
|
||||
|
||||
sub parse { # In subclass
|
||||
my $self = $_[0];
|
||||
$self->SUPER::parse(@_);
|
||||
foreach ($self->tree) {
|
||||
if ($_->{content} =~ m#^a\s+#i) {
|
||||
$_->{type} = 'anchor_start';
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
sub output_anchor_start {
|
||||
# Whatever you want...
|
||||
}
|
||||
|
||||
Alternatively, this feature might be implemented by hanging attatchments
|
||||
onto the parsing loop, like this:
|
||||
|
||||
my $parser = new SimpleParse( $html_text );
|
||||
$regex = '<(a\s+.*?)>';
|
||||
$parser->watch_for( 'anchor_start', $regex );
|
||||
|
||||
sub SimpleParse::output_anchor_start {
|
||||
# Whatever you want...
|
||||
}
|
||||
|
||||
I think I like that idea better. If you wanted to, you could make a subclass
|
||||
with output_anchor_start as one of its methods, and put the ->watch_for
|
||||
stuff in the constructor.
|
||||
|
||||
|
||||
=item * reading from filehandles
|
||||
|
||||
It would be nice if you could initialize an object by giving it a filehandle
|
||||
or filename instead of the text itself.
|
||||
|
||||
=item * tests
|
||||
|
||||
I need to write a few tests that run under "make test".
|
||||
|
||||
|
||||
=back
|
||||
|
||||
|
||||
=head1 AUTHOR
|
||||
|
||||
Ken Williams <ken@forum.swarthmore.edu>
|
||||
|
||||
=head1 COPYRIGHT
|
||||
|
||||
Copyright 1998 Swarthmore College. All rights reserved.
|
||||
|
||||
This library is free software; you can redistribute it and/or
|
||||
modify it under the same terms as Perl itself.
|
||||
|
||||
=cut
|
||||
471
database/perl/lib/HTML/Tagset.pm
Normal file
471
database/perl/lib/HTML/Tagset.pm
Normal file
@@ -0,0 +1,471 @@
|
||||
package HTML::Tagset;
|
||||
|
||||
use strict;
|
||||
|
||||
=head1 NAME
|
||||
|
||||
HTML::Tagset - data tables useful in parsing HTML
|
||||
|
||||
=head1 VERSION
|
||||
|
||||
Version 3.20
|
||||
|
||||
=cut
|
||||
|
||||
use vars qw( $VERSION );
|
||||
|
||||
$VERSION = '3.20';
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
use HTML::Tagset;
|
||||
# Then use any of the items in the HTML::Tagset package
|
||||
# as need arises
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
This module contains several data tables useful in various kinds of
|
||||
HTML parsing operations.
|
||||
|
||||
Note that all tag names used are lowercase.
|
||||
|
||||
In the following documentation, a "hashset" is a hash being used as a
|
||||
set -- the hash conveys that its keys are there, and the actual values
|
||||
associated with the keys are not significant. (But what values are
|
||||
there, are always true.)
|
||||
|
||||
=cut
|
||||
|
||||
use vars qw(
|
||||
$VERSION
|
||||
%emptyElement %optionalEndTag %linkElements %boolean_attr
|
||||
%isHeadElement %isBodyElement %isPhraseMarkup
|
||||
%is_Possible_Strict_P_Content
|
||||
%isHeadOrBodyElement
|
||||
%isList %isTableElement %isFormElement
|
||||
%isKnown %canTighten
|
||||
@p_closure_barriers
|
||||
%isCDATA_Parent
|
||||
);
|
||||
|
||||
=head1 VARIABLES
|
||||
|
||||
Note that none of these variables are exported.
|
||||
|
||||
=head2 hashset %HTML::Tagset::emptyElement
|
||||
|
||||
This hashset has as values the tag-names (GIs) of elements that cannot
|
||||
have content. (For example, "base", "br", "hr".) So
|
||||
C<$HTML::Tagset::emptyElement{'hr'}> exists and is true.
|
||||
C<$HTML::Tagset::emptyElement{'dl'}> does not exist, and so is not true.
|
||||
|
||||
=cut
|
||||
|
||||
%emptyElement = map {; $_ => 1 } qw(base link meta isindex
|
||||
img br hr wbr
|
||||
input area param
|
||||
embed bgsound spacer
|
||||
basefont col frame
|
||||
~comment ~literal
|
||||
~declaration ~pi
|
||||
);
|
||||
# The "~"-initial names are for pseudo-elements used by HTML::Entities
|
||||
# and TreeBuilder
|
||||
|
||||
=head2 hashset %HTML::Tagset::optionalEndTag
|
||||
|
||||
This hashset lists tag-names for elements that can have content, but whose
|
||||
end-tags are generally, "safely", omissible. Example:
|
||||
C<$HTML::Tagset::emptyElement{'li'}> exists and is true.
|
||||
|
||||
=cut
|
||||
|
||||
%optionalEndTag = map {; $_ => 1 } qw(p li dt dd); # option th tr td);
|
||||
|
||||
=head2 hash %HTML::Tagset::linkElements
|
||||
|
||||
Values in this hash are tagnames for elements that might contain
|
||||
links, and the value for each is a reference to an array of the names
|
||||
of attributes whose values can be links.
|
||||
|
||||
=cut
|
||||
|
||||
%linkElements =
|
||||
(
|
||||
'a' => ['href'],
|
||||
'applet' => ['archive', 'codebase', 'code'],
|
||||
'area' => ['href'],
|
||||
'base' => ['href'],
|
||||
'bgsound' => ['src'],
|
||||
'blockquote' => ['cite'],
|
||||
'body' => ['background'],
|
||||
'del' => ['cite'],
|
||||
'embed' => ['pluginspage', 'src'],
|
||||
'form' => ['action'],
|
||||
'frame' => ['src', 'longdesc'],
|
||||
'iframe' => ['src', 'longdesc'],
|
||||
'ilayer' => ['background'],
|
||||
'img' => ['src', 'lowsrc', 'longdesc', 'usemap'],
|
||||
'input' => ['src', 'usemap'],
|
||||
'ins' => ['cite'],
|
||||
'isindex' => ['action'],
|
||||
'head' => ['profile'],
|
||||
'layer' => ['background', 'src'],
|
||||
'link' => ['href'],
|
||||
'object' => ['classid', 'codebase', 'data', 'archive', 'usemap'],
|
||||
'q' => ['cite'],
|
||||
'script' => ['src', 'for'],
|
||||
'table' => ['background'],
|
||||
'td' => ['background'],
|
||||
'th' => ['background'],
|
||||
'tr' => ['background'],
|
||||
'xmp' => ['href'],
|
||||
);
|
||||
|
||||
=head2 hash %HTML::Tagset::boolean_attr
|
||||
|
||||
This hash (not hashset) lists what attributes of what elements can be
|
||||
printed without showing the value (for example, the "noshade" attribute
|
||||
of "hr" elements). For elements with only one such attribute, its value
|
||||
is simply that attribute name. For elements with many such attributes,
|
||||
the value is a reference to a hashset containing all such attributes.
|
||||
|
||||
=cut
|
||||
|
||||
%boolean_attr = (
|
||||
# TODO: make these all hashes
|
||||
'area' => 'nohref',
|
||||
'dir' => 'compact',
|
||||
'dl' => 'compact',
|
||||
'hr' => 'noshade',
|
||||
'img' => 'ismap',
|
||||
'input' => { 'checked' => 1, 'readonly' => 1, 'disabled' => 1 },
|
||||
'menu' => 'compact',
|
||||
'ol' => 'compact',
|
||||
'option' => 'selected',
|
||||
'select' => 'multiple',
|
||||
'td' => 'nowrap',
|
||||
'th' => 'nowrap',
|
||||
'ul' => 'compact',
|
||||
);
|
||||
|
||||
#==========================================================================
|
||||
# List of all elements from Extensible HTML version 1.0 Transitional DTD:
|
||||
#
|
||||
# a abbr acronym address applet area b base basefont bdo big
|
||||
# blockquote body br button caption center cite code col colgroup
|
||||
# dd del dfn dir div dl dt em fieldset font form h1 h2 h3 h4 h5 h6
|
||||
# head hr html i iframe img input ins isindex kbd label legend li
|
||||
# link map menu meta noframes noscript object ol optgroup option p
|
||||
# param pre q s samp script select small span strike strong style
|
||||
# sub sup table tbody td textarea tfoot th thead title tr tt u ul
|
||||
# var
|
||||
#
|
||||
# Varia from Mozilla source internal table of tags:
|
||||
# Implemented:
|
||||
# xmp listing wbr nobr frame frameset noframes ilayer
|
||||
# layer nolayer spacer embed multicol
|
||||
# But these are unimplemented:
|
||||
# sound?? keygen?? server??
|
||||
# Also seen here and there:
|
||||
# marquee?? app?? (both unimplemented)
|
||||
#==========================================================================
|
||||
|
||||
=head2 hashset %HTML::Tagset::isPhraseMarkup
|
||||
|
||||
This hashset contains all phrasal-level elements.
|
||||
|
||||
=cut
|
||||
|
||||
%isPhraseMarkup = map {; $_ => 1 } qw(
|
||||
span abbr acronym q sub sup
|
||||
cite code em kbd samp strong var dfn strike
|
||||
b i u s tt small big
|
||||
a img br
|
||||
wbr nobr blink
|
||||
font basefont bdo
|
||||
spacer embed noembed
|
||||
); # had: center, hr, table
|
||||
|
||||
|
||||
=head2 hashset %HTML::Tagset::is_Possible_Strict_P_Content
|
||||
|
||||
This hashset contains all phrasal-level elements that be content of a
|
||||
P element, for a strict model of HTML.
|
||||
|
||||
=cut
|
||||
|
||||
%is_Possible_Strict_P_Content = (
|
||||
%isPhraseMarkup,
|
||||
%isFormElement,
|
||||
map {; $_ => 1} qw( object script map )
|
||||
# I've no idea why there's these latter exceptions.
|
||||
# I'm just following the HTML4.01 DTD.
|
||||
);
|
||||
|
||||
#from html4 strict:
|
||||
#<!ENTITY % fontstyle "TT | I | B | BIG | SMALL">
|
||||
#
|
||||
#<!ENTITY % phrase "EM | STRONG | DFN | CODE |
|
||||
# SAMP | KBD | VAR | CITE | ABBR | ACRONYM" >
|
||||
#
|
||||
#<!ENTITY % special
|
||||
# "A | IMG | OBJECT | BR | SCRIPT | MAP | Q | SUB | SUP | SPAN | BDO">
|
||||
#
|
||||
#<!ENTITY % formctrl "INPUT | SELECT | TEXTAREA | LABEL | BUTTON">
|
||||
#
|
||||
#<!-- %inline; covers inline or "text-level" elements -->
|
||||
#<!ENTITY % inline "#PCDATA | %fontstyle; | %phrase; | %special; | %formctrl;">
|
||||
|
||||
=head2 hashset %HTML::Tagset::isHeadElement
|
||||
|
||||
This hashset contains all elements that elements that should be
|
||||
present only in the 'head' element of an HTML document.
|
||||
|
||||
=cut
|
||||
|
||||
%isHeadElement = map {; $_ => 1 }
|
||||
qw(title base link meta isindex script style object bgsound);
|
||||
|
||||
=head2 hashset %HTML::Tagset::isList
|
||||
|
||||
This hashset contains all elements that can contain "li" elements.
|
||||
|
||||
=cut
|
||||
|
||||
%isList = map {; $_ => 1 } qw(ul ol dir menu);
|
||||
|
||||
=head2 hashset %HTML::Tagset::isTableElement
|
||||
|
||||
This hashset contains all elements that are to be found only in/under
|
||||
a "table" element.
|
||||
|
||||
=cut
|
||||
|
||||
%isTableElement = map {; $_ => 1 }
|
||||
qw(tr td th thead tbody tfoot caption col colgroup);
|
||||
|
||||
=head2 hashset %HTML::Tagset::isFormElement
|
||||
|
||||
This hashset contains all elements that are to be found only in/under
|
||||
a "form" element.
|
||||
|
||||
=cut
|
||||
|
||||
%isFormElement = map {; $_ => 1 }
|
||||
qw(input select option optgroup textarea button label);
|
||||
|
||||
=head2 hashset %HTML::Tagset::isBodyMarkup
|
||||
|
||||
This hashset contains all elements that are to be found only in/under
|
||||
the "body" element of an HTML document.
|
||||
|
||||
=cut
|
||||
|
||||
%isBodyElement = map {; $_ => 1 } qw(
|
||||
h1 h2 h3 h4 h5 h6
|
||||
p div pre plaintext address blockquote
|
||||
xmp listing
|
||||
center
|
||||
|
||||
multicol
|
||||
iframe ilayer nolayer
|
||||
bgsound
|
||||
|
||||
hr
|
||||
ol ul dir menu li
|
||||
dl dt dd
|
||||
ins del
|
||||
|
||||
fieldset legend
|
||||
|
||||
map area
|
||||
applet param object
|
||||
isindex script noscript
|
||||
table
|
||||
center
|
||||
form
|
||||
),
|
||||
keys %isFormElement,
|
||||
keys %isPhraseMarkup, # And everything phrasal
|
||||
keys %isTableElement,
|
||||
;
|
||||
|
||||
|
||||
=head2 hashset %HTML::Tagset::isHeadOrBodyElement
|
||||
|
||||
This hashset includes all elements that I notice can fall either in
|
||||
the head or in the body.
|
||||
|
||||
=cut
|
||||
|
||||
%isHeadOrBodyElement = map {; $_ => 1 }
|
||||
qw(script isindex style object map area param noscript bgsound);
|
||||
# i.e., if we find 'script' in the 'body' or the 'head', don't freak out.
|
||||
|
||||
|
||||
=head2 hashset %HTML::Tagset::isKnown
|
||||
|
||||
This hashset lists all known HTML elements.
|
||||
|
||||
=cut
|
||||
|
||||
%isKnown = (%isHeadElement, %isBodyElement,
|
||||
map{; $_=>1 }
|
||||
qw( head body html
|
||||
frame frameset noframes
|
||||
~comment ~pi ~directive ~literal
|
||||
));
|
||||
# that should be all known tags ever ever
|
||||
|
||||
|
||||
=head2 hashset %HTML::Tagset::canTighten
|
||||
|
||||
This hashset lists elements that might have ignorable whitespace as
|
||||
children or siblings.
|
||||
|
||||
=cut
|
||||
|
||||
%canTighten = %isKnown;
|
||||
delete @canTighten{
|
||||
keys(%isPhraseMarkup), 'input', 'select',
|
||||
'xmp', 'listing', 'plaintext', 'pre',
|
||||
};
|
||||
# xmp, listing, plaintext, and pre are untightenable, and
|
||||
# in a really special way.
|
||||
@canTighten{'hr','br'} = (1,1);
|
||||
# exceptional 'phrasal' things that ARE subject to tightening.
|
||||
|
||||
# The one case where I can think of my tightening rules failing is:
|
||||
# <p>foo bar<center> <em>baz quux</em> ...
|
||||
# ^-- that would get deleted.
|
||||
# But that's pretty gruesome code anyhow. You gets what you pays for.
|
||||
|
||||
#==========================================================================
|
||||
|
||||
=head2 array @HTML::Tagset::p_closure_barriers
|
||||
|
||||
This array has a meaning that I have only seen a need for in
|
||||
C<HTML::TreeBuilder>, but I include it here on the off chance that someone
|
||||
might find it of use:
|
||||
|
||||
When we see a "E<lt>pE<gt>" token, we go lookup up the lineage for a p
|
||||
element we might have to minimize. At first sight, we might say that
|
||||
if there's a p anywhere in the lineage of this new p, it should be
|
||||
closed. But that's wrong. Consider this document:
|
||||
|
||||
<html>
|
||||
<head>
|
||||
<title>foo</title>
|
||||
</head>
|
||||
<body>
|
||||
<p>foo
|
||||
<table>
|
||||
<tr>
|
||||
<td>
|
||||
foo
|
||||
<p>bar
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
</p>
|
||||
</body>
|
||||
</html>
|
||||
|
||||
The second p is quite legally inside a much higher p.
|
||||
|
||||
My formalization of the reason why this is legal, but this:
|
||||
|
||||
<p>foo<p>bar</p></p>
|
||||
|
||||
isn't, is that something about the table constitutes a "barrier" to
|
||||
the application of the rule about what p must minimize.
|
||||
|
||||
So C<@HTML::Tagset::p_closure_barriers> is the list of all such
|
||||
barrier-tags.
|
||||
|
||||
=cut
|
||||
|
||||
@p_closure_barriers = qw(
|
||||
li blockquote
|
||||
ul ol menu dir
|
||||
dl dt dd
|
||||
td th tr table caption
|
||||
div
|
||||
);
|
||||
|
||||
# In an ideal world (i.e., XHTML) we wouldn't have to bother with any of this
|
||||
# monkey business of barriers to minimization!
|
||||
|
||||
=head2 hashset %isCDATA_Parent
|
||||
|
||||
This hashset includes all elements whose content is CDATA.
|
||||
|
||||
=cut
|
||||
|
||||
%isCDATA_Parent = map {; $_ => 1 }
|
||||
qw(script style xmp listing plaintext);
|
||||
|
||||
# TODO: there's nothing else that takes CDATA children, right?
|
||||
|
||||
# As the HTML3 DTD (Raggett 1995-04-24) noted:
|
||||
# The XMP, LISTING and PLAINTEXT tags are incompatible with SGML
|
||||
# and derive from very early versions of HTML. They require non-
|
||||
# standard parsers and will cause problems for processing
|
||||
# documents with standard SGML tools.
|
||||
|
||||
|
||||
=head1 CAVEATS
|
||||
|
||||
You may find it useful to alter the behavior of modules (like
|
||||
C<HTML::Element> or C<HTML::TreeBuilder>) that use C<HTML::Tagset>'s
|
||||
data tables by altering the data tables themselves. You are welcome
|
||||
to try, but be careful; and be aware that different modules may or may
|
||||
react differently to the data tables being changed.
|
||||
|
||||
Note that it may be inappropriate to use these tables for I<producing>
|
||||
HTML -- for example, C<%isHeadOrBodyElement> lists the tagnames
|
||||
for all elements that can appear either in the head or in the body,
|
||||
such as "script". That doesn't mean that I am saying your code that
|
||||
produces HTML should feel free to put script elements in either place!
|
||||
If you are producing programs that spit out HTML, you should be
|
||||
I<intimately> familiar with the DTDs for HTML or XHTML (available at
|
||||
C<http://www.w3.org/>), and you should slavishly obey them, not
|
||||
the data tables in this document.
|
||||
|
||||
=head1 SEE ALSO
|
||||
|
||||
L<HTML::Element>, L<HTML::TreeBuilder>, L<HTML::LinkExtor>
|
||||
|
||||
=head1 COPYRIGHT & LICENSE
|
||||
|
||||
Copyright 1995-2000 Gisle Aas.
|
||||
|
||||
Copyright 2000-2005 Sean M. Burke.
|
||||
|
||||
Copyright 2005-2008 Andy Lester.
|
||||
|
||||
This program is free software; you can redistribute it and/or modify it
|
||||
under the same terms as Perl itself.
|
||||
|
||||
=head1 ACKNOWLEDGEMENTS
|
||||
|
||||
Most of the code/data in this module was adapted from code written
|
||||
by Gisle Aas for C<HTML::Element>, C<HTML::TreeBuilder>, and
|
||||
C<HTML::LinkExtor>. Then it was maintained by Sean M. Burke.
|
||||
|
||||
=head1 AUTHOR
|
||||
|
||||
Current maintainer: Andy Lester, C<< <andy at petdance.com> >>
|
||||
|
||||
=head1 BUGS
|
||||
|
||||
Please report any bugs or feature requests to
|
||||
C<bug-html-tagset at rt.cpan.org>, or through the web interface at
|
||||
L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=HTML-Tagset>. I will
|
||||
be notified, and then you'll automatically be notified of progress on
|
||||
your bug as I make changes.
|
||||
|
||||
=cut
|
||||
|
||||
1;
|
||||
369
database/perl/lib/HTML/TokeParser.pm
Normal file
369
database/perl/lib/HTML/TokeParser.pm
Normal file
@@ -0,0 +1,369 @@
|
||||
package HTML::TokeParser;
|
||||
|
||||
require HTML::PullParser;
|
||||
@ISA=qw(HTML::PullParser);
|
||||
$VERSION = "3.57";
|
||||
|
||||
use strict;
|
||||
use Carp ();
|
||||
use HTML::Entities qw(decode_entities);
|
||||
use HTML::Tagset ();
|
||||
|
||||
my %ARGS =
|
||||
(
|
||||
start => "'S',tagname,attr,attrseq,text",
|
||||
end => "'E',tagname,text",
|
||||
text => "'T',text,is_cdata",
|
||||
process => "'PI',token0,text",
|
||||
comment => "'C',text",
|
||||
declaration => "'D',text",
|
||||
|
||||
# options that default on
|
||||
unbroken_text => 1,
|
||||
);
|
||||
|
||||
|
||||
sub new
|
||||
{
|
||||
my $class = shift;
|
||||
my %cnf;
|
||||
if (@_ == 1) {
|
||||
my $type = (ref($_[0]) eq "SCALAR") ? "doc" : "file";
|
||||
%cnf = ($type => $_[0]);
|
||||
}
|
||||
else {
|
||||
%cnf = @_;
|
||||
}
|
||||
|
||||
my $textify = delete $cnf{textify} || {img => "alt", applet => "alt"};
|
||||
|
||||
my $self = $class->SUPER::new(%cnf, %ARGS) || return undef;
|
||||
|
||||
$self->{textify} = $textify;
|
||||
$self;
|
||||
}
|
||||
|
||||
|
||||
sub get_tag
|
||||
{
|
||||
my $self = shift;
|
||||
my $token;
|
||||
while (1) {
|
||||
$token = $self->get_token || return undef;
|
||||
my $type = shift @$token;
|
||||
next unless $type eq "S" || $type eq "E";
|
||||
substr($token->[0], 0, 0) = "/" if $type eq "E";
|
||||
return $token unless @_;
|
||||
for (@_) {
|
||||
return $token if $token->[0] eq $_;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
sub _textify {
|
||||
my($self, $token) = @_;
|
||||
my $tag = $token->[1];
|
||||
return undef unless exists $self->{textify}{$tag};
|
||||
|
||||
my $alt = $self->{textify}{$tag};
|
||||
my $text;
|
||||
if (ref($alt)) {
|
||||
$text = &$alt(@$token);
|
||||
} else {
|
||||
$text = $token->[2]{$alt || "alt"};
|
||||
$text = "[\U$tag]" unless defined $text;
|
||||
}
|
||||
return $text;
|
||||
}
|
||||
|
||||
|
||||
sub get_text
|
||||
{
|
||||
my $self = shift;
|
||||
my @text;
|
||||
while (my $token = $self->get_token) {
|
||||
my $type = $token->[0];
|
||||
if ($type eq "T") {
|
||||
my $text = $token->[1];
|
||||
decode_entities($text) unless $token->[2];
|
||||
push(@text, $text);
|
||||
} elsif ($type =~ /^[SE]$/) {
|
||||
my $tag = $token->[1];
|
||||
if ($type eq "S") {
|
||||
if (defined(my $text = _textify($self, $token))) {
|
||||
push(@text, $text);
|
||||
next;
|
||||
}
|
||||
} else {
|
||||
$tag = "/$tag";
|
||||
}
|
||||
if (!@_ || grep $_ eq $tag, @_) {
|
||||
$self->unget_token($token);
|
||||
last;
|
||||
}
|
||||
push(@text, " ")
|
||||
if $tag eq "br" || !$HTML::Tagset::isPhraseMarkup{$token->[1]};
|
||||
}
|
||||
}
|
||||
join("", @text);
|
||||
}
|
||||
|
||||
|
||||
sub get_trimmed_text
|
||||
{
|
||||
my $self = shift;
|
||||
my $text = $self->get_text(@_);
|
||||
$text =~ s/^\s+//; $text =~ s/\s+$//; $text =~ s/\s+/ /g;
|
||||
$text;
|
||||
}
|
||||
|
||||
sub get_phrase {
|
||||
my $self = shift;
|
||||
my @text;
|
||||
while (my $token = $self->get_token) {
|
||||
my $type = $token->[0];
|
||||
if ($type eq "T") {
|
||||
my $text = $token->[1];
|
||||
decode_entities($text) unless $token->[2];
|
||||
push(@text, $text);
|
||||
} elsif ($type =~ /^[SE]$/) {
|
||||
my $tag = $token->[1];
|
||||
if ($type eq "S") {
|
||||
if (defined(my $text = _textify($self, $token))) {
|
||||
push(@text, $text);
|
||||
next;
|
||||
}
|
||||
}
|
||||
if (!$HTML::Tagset::isPhraseMarkup{$tag}) {
|
||||
$self->unget_token($token);
|
||||
last;
|
||||
}
|
||||
push(@text, " ") if $tag eq "br";
|
||||
}
|
||||
}
|
||||
my $text = join("", @text);
|
||||
$text =~ s/^\s+//; $text =~ s/\s+$//; $text =~ s/\s+/ /g;
|
||||
$text;
|
||||
}
|
||||
|
||||
1;
|
||||
|
||||
|
||||
__END__
|
||||
|
||||
=head1 NAME
|
||||
|
||||
HTML::TokeParser - Alternative HTML::Parser interface
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
require HTML::TokeParser;
|
||||
$p = HTML::TokeParser->new("index.html") ||
|
||||
die "Can't open: $!";
|
||||
$p->empty_element_tags(1); # configure its behaviour
|
||||
|
||||
while (my $token = $p->get_token) {
|
||||
#...
|
||||
}
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
The C<HTML::TokeParser> is an alternative interface to the
|
||||
C<HTML::Parser> class. It is an C<HTML::PullParser> subclass with a
|
||||
predeclared set of token types. If you wish the tokens to be reported
|
||||
differently you probably want to use the C<HTML::PullParser> directly.
|
||||
|
||||
The following methods are available:
|
||||
|
||||
=over 4
|
||||
|
||||
=item $p = HTML::TokeParser->new( $filename, %opt );
|
||||
|
||||
=item $p = HTML::TokeParser->new( $filehandle, %opt );
|
||||
|
||||
=item $p = HTML::TokeParser->new( \$document, %opt );
|
||||
|
||||
The object constructor argument is either a file name, a file handle
|
||||
object, or the complete document to be parsed. Extra options can be
|
||||
provided as key/value pairs and are processed as documented by the base
|
||||
classes.
|
||||
|
||||
If the argument is a plain scalar, then it is taken as the name of a
|
||||
file to be opened and parsed. If the file can't be opened for
|
||||
reading, then the constructor will return C<undef> and $! will tell
|
||||
you why it failed.
|
||||
|
||||
If the argument is a reference to a plain scalar, then this scalar is
|
||||
taken to be the literal document to parse. The value of this
|
||||
scalar should not be changed before all tokens have been extracted.
|
||||
|
||||
Otherwise the argument is taken to be some object that the
|
||||
C<HTML::TokeParser> can read() from when it needs more data. Typically
|
||||
it will be a filehandle of some kind. The stream will be read() until
|
||||
EOF, but not closed.
|
||||
|
||||
A newly constructed C<HTML::TokeParser> differ from its base classes
|
||||
by having the C<unbroken_text> attribute enabled by default. See
|
||||
L<HTML::Parser> for a description of this and other attributes that
|
||||
influence how the document is parsed. It is often a good idea to enable
|
||||
C<empty_element_tags> behaviour.
|
||||
|
||||
Note that the parsing result will likely not be valid if raw undecoded
|
||||
UTF-8 is used as a source. When parsing UTF-8 encoded files turn
|
||||
on UTF-8 decoding:
|
||||
|
||||
open(my $fh, "<:utf8", "index.html") || die "Can't open 'index.html': $!";
|
||||
my $p = HTML::TokeParser->new( $fh );
|
||||
# ...
|
||||
|
||||
If a $filename is passed to the constructor the file will be opened in
|
||||
raw mode and the parsing result will only be valid if its content is
|
||||
Latin-1 or pure ASCII.
|
||||
|
||||
If parsing from an UTF-8 encoded string buffer decode it first:
|
||||
|
||||
utf8::decode($document);
|
||||
my $p = HTML::TokeParser->new( \$document );
|
||||
# ...
|
||||
|
||||
=item $p->get_token
|
||||
|
||||
This method will return the next I<token> found in the HTML document,
|
||||
or C<undef> at the end of the document. The token is returned as an
|
||||
array reference. The first element of the array will be a string
|
||||
denoting the type of this token: "S" for start tag, "E" for end tag,
|
||||
"T" for text, "C" for comment, "D" for declaration, and "PI" for
|
||||
process instructions. The rest of the token array depend on the type
|
||||
like this:
|
||||
|
||||
["S", $tag, $attr, $attrseq, $text]
|
||||
["E", $tag, $text]
|
||||
["T", $text, $is_data]
|
||||
["C", $text]
|
||||
["D", $text]
|
||||
["PI", $token0, $text]
|
||||
|
||||
where $attr is a hash reference, $attrseq is an array reference and
|
||||
the rest are plain scalars. The L<HTML::Parser/Argspec> explains the
|
||||
details.
|
||||
|
||||
=item $p->unget_token( @tokens )
|
||||
|
||||
If you find you have read too many tokens you can push them back,
|
||||
so that they are returned the next time $p->get_token is called.
|
||||
|
||||
=item $p->get_tag
|
||||
|
||||
=item $p->get_tag( @tags )
|
||||
|
||||
This method returns the next start or end tag (skipping any other
|
||||
tokens), or C<undef> if there are no more tags in the document. If
|
||||
one or more arguments are given, then we skip tokens until one of the
|
||||
specified tag types is found. For example:
|
||||
|
||||
$p->get_tag("font", "/font");
|
||||
|
||||
will find the next start or end tag for a font-element.
|
||||
|
||||
The tag information is returned as an array reference in the same form
|
||||
as for $p->get_token above, but the type code (first element) is
|
||||
missing. A start tag will be returned like this:
|
||||
|
||||
[$tag, $attr, $attrseq, $text]
|
||||
|
||||
The tagname of end tags are prefixed with "/", i.e. end tag is
|
||||
returned like this:
|
||||
|
||||
["/$tag", $text]
|
||||
|
||||
=item $p->get_text
|
||||
|
||||
=item $p->get_text( @endtags )
|
||||
|
||||
This method returns all text found at the current position. It will
|
||||
return a zero length string if the next token is not text. Any
|
||||
entities will be converted to their corresponding character.
|
||||
|
||||
If one or more arguments are given, then we return all text occurring
|
||||
before the first of the specified tags found. For example:
|
||||
|
||||
$p->get_text("p", "br");
|
||||
|
||||
will return the text up to either a paragraph of linebreak element.
|
||||
|
||||
The text might span tags that should be I<textified>. This is
|
||||
controlled by the $p->{textify} attribute, which is a hash that
|
||||
defines how certain tags can be treated as text. If the name of a
|
||||
start tag matches a key in this hash then this tag is converted to
|
||||
text. The hash value is used to specify which tag attribute to obtain
|
||||
the text from. If this tag attribute is missing, then the upper case
|
||||
name of the tag enclosed in brackets is returned, e.g. "[IMG]". The
|
||||
hash value can also be a subroutine reference. In this case the
|
||||
routine is called with the start tag token content as its argument and
|
||||
the return value is treated as the text.
|
||||
|
||||
The default $p->{textify} value is:
|
||||
|
||||
{img => "alt", applet => "alt"}
|
||||
|
||||
This means that <IMG> and <APPLET> tags are treated as text, and that
|
||||
the text to substitute can be found in the ALT attribute.
|
||||
|
||||
=item $p->get_trimmed_text
|
||||
|
||||
=item $p->get_trimmed_text( @endtags )
|
||||
|
||||
Same as $p->get_text above, but will collapse any sequences of white
|
||||
space to a single space character. Leading and trailing white space is
|
||||
removed.
|
||||
|
||||
=item $p->get_phrase
|
||||
|
||||
This will return all text found at the current position ignoring any
|
||||
phrasal-level tags. Text is extracted until the first non
|
||||
phrasal-level tag. Textification of tags is the same as for
|
||||
get_text(). This method will collapse white space in the same way as
|
||||
get_trimmed_text() does.
|
||||
|
||||
The definition of <i>phrasal-level tags</i> is obtained from the
|
||||
HTML::Tagset module.
|
||||
|
||||
=back
|
||||
|
||||
=head1 EXAMPLES
|
||||
|
||||
This example extracts all links from a document. It will print one
|
||||
line for each link, containing the URL and the textual description
|
||||
between the <A>...</A> tags:
|
||||
|
||||
use HTML::TokeParser;
|
||||
$p = HTML::TokeParser->new(shift||"index.html");
|
||||
|
||||
while (my $token = $p->get_tag("a")) {
|
||||
my $url = $token->[1]{href} || "-";
|
||||
my $text = $p->get_trimmed_text("/a");
|
||||
print "$url\t$text\n";
|
||||
}
|
||||
|
||||
This example extract the <TITLE> from the document:
|
||||
|
||||
use HTML::TokeParser;
|
||||
$p = HTML::TokeParser->new(shift||"index.html");
|
||||
if ($p->get_tag("title")) {
|
||||
my $title = $p->get_trimmed_text;
|
||||
print "Title: $title\n";
|
||||
}
|
||||
|
||||
=head1 SEE ALSO
|
||||
|
||||
L<HTML::PullParser>, L<HTML::Parser>
|
||||
|
||||
=head1 COPYRIGHT
|
||||
|
||||
Copyright 1998-2005 Gisle Aas.
|
||||
|
||||
This library is free software; you can redistribute it and/or
|
||||
modify it under the same terms as Perl itself.
|
||||
|
||||
=cut
|
||||
Reference in New Issue
Block a user