Initial Commit

2025-12-03 16:38:10 +01:00
parent c5e26bf594
commit b732d8d4b5
17680 changed files with 5977495 additions and 2 deletions
--- a/database/perl/lib/HTML/SimpleParse.pm
+++ b/database/perl/lib/HTML/SimpleParse.pm
@@ -0,0 +1,453 @@
+package HTML::SimpleParse;
+
+use strict;
+use vars qw($VERSION $FIX_CASE);
+
+$VERSION = '0.12';
+my $debug = 0;
+
+sub new {
+  my $pack = shift;
+  
+  my $self = bless {
+		    'text' => shift(),
+		    'tree' => [],
+		    @_
+		   }, $pack;
+  
+  $self->parse() if defined $self->{'text'} and length $self->{'text'};
+  return $self;
+}
+
+sub text {
+  my $self = shift;
+  $self->{'text'} = shift if @_;
+  return $self->{'text'};
+}
+
+sub tree { @{$_[0]->{'tree'}} }
+
+sub parse {
+  # Much of this is a dumbed-down version of HTML::Parser::parse.
+  
+  my $self = shift;
+  my $text = \ $self->{'text'};
+  my $tree = $self->{'tree'};
+  
+  # Parse html text in $$text.  The strategy is to remove complete
+  # tokens from the beginning of $$text until we can't decide whether
+  # it is a token or not, or the $$text is empty.
+  
+  @$tree = ();
+  while (1) {
+    my ($content, $type);
+    
+    # First we try to pull off any plain text (anything before a "<" char)
+    if ($$text =~ /\G([^<]+)/gcs) {
+      $content = $1; $type = 'text';
+      
+      # Then, SSI, comments, and markup declarations (usually <!DOCTYPE...>)
+      # ssi:     <!--#stuff-->
+      # comment: <!--stuff-->
+      # markup:  <!stuff>
+    } elsif ($$text =~ /\G<(!--(\#?).*?--)>/gcs) {
+      $type = ($2 ? 'ssi' : 'comment');
+      $content = $1;
+      
+    } elsif ($$text =~ /\G<(!.*?)>/gcs) {
+      $type = 'markup';
+      $content = $1;
+      
+      # Then, look for an end tag
+    } elsif ($$text =~ m|\G<(/[a-zA-Z][a-zA-Z0-9\.\-]*\s*)>|gcs) {
+      $content = $1; $type = 'endtag';
+      
+      # Then, finally we look for a start tag
+      # We know the first char is <, make sure there's a >
+    } elsif ($$text =~ /\G<(.*?)>/gcs) {
+      $content = $1; $type = 'starttag';
+      
+    } else {
+      # the string is exhausted, or there's no > in it.
+      push @$tree, {
+		    'content' => substr($$text, pos $$text),
+		    'type'    => 'text',
+		   } unless pos($$text) eq length($$text);
+      last;
+    }
+    
+    push @$tree, {
+		  'content' => $content,
+		  'type'    => $type,
+		  'offset'  => ($type eq 'text' ? 
+				pos($$text) - length($content) : 
+				pos($$text) - length($content) - 2),
+		 };
+  }
+  
+  $self;
+}
+
+
+$FIX_CASE = 1;
+sub parse_args {
+  my $self = shift;  # Not needed here
+  my $str = shift;
+  my $fix_case = ((ref $self and exists $self->{fix_case}) ? $self->{fix_case} : $FIX_CASE);
+  my @returns;
+  
+  # Make sure we start searching at the beginning of the string
+  pos($str) = 0;
+  
+  while (1) {
+    next if $str =~ m/\G\s+/gc;  # Get rid of leading whitespace
+    
+    if ( $str =~ m/\G
+	 ([\w.-]+)\s*=\s*                         # the key
+	 (?:
+	  "([^\"\\]*  (?: \\.[^\"\\]* )* )"\s*    # quoted string, with possible whitespace inside,
+	  |                                       #  or
+	  '([^\'\\]*  (?: \\.[^\'\\]* )* )'\s*    # quoted string, with possible whitespace inside,
+	  |                                       #  or
+	  ([^\s>]*)\s*                            # anything else, without whitespace or >
+	 )/gcx ) {
+      
+      my ($key, $val) = ($1, $+);
+      $val =~ s/\\(.)/$1/gs;
+      push @returns, ($fix_case==1 ? uc($key) : $fix_case==-1 ? lc($key) : $key), $val;
+      
+    } elsif ( $str =~ m,\G/?([\w.-]+)\s*,gc ) {
+      push @returns, ($fix_case==1 ? uc($1)   : $fix_case==-1 ? lc($1)   : $1  ), undef;
+    } else {
+      last;
+    }
+  }
+  
+  return @returns;
+}
+
+
+sub execute {
+  my $self = shift;
+  my $ref = shift;
+  my $method = "output_$ref->{type}";
+  warn "calling $self->$method(...)" if $debug;
+  return $self->$method($ref->{content});
+}
+
+sub get_output {
+  my $self = shift;
+  my ($method, $out) = ('', '');
+  foreach ($self->tree) {
+    $out .= $self->execute($_);
+  }
+  return $out;
+}
+
+
+sub output {
+  my $self = shift;
+  my $method;
+  foreach ($self->tree) {
+    print $self->execute($_);
+  }
+}
+
+sub output_text      {   $_[1]; }
+sub output_comment   { "<$_[1]>"; }
+sub output_endtag    { "<$_[1]>"; }
+sub output_starttag  { "<$_[1]>"; }
+sub output_markup    { "<$_[1]>"; }
+sub output_ssi       { "<$_[1]>"; }
+
+1;
+__END__
+
+=head1 NAME
+
+HTML::SimpleParse - a bare-bones HTML parser
+
+=head1 SYNOPSIS
+
+ use HTML::SimpleParse;
+
+ # Parse the text into a simple tree
+ my $p = new HTML::SimpleParse( $html_text );
+ $p->output;                 # Output the HTML verbatim
+ 
+ $p->text( $new_text );      # Give it some new HTML to chew on
+ $p->parse                   # Parse the new HTML
+ $p->output;
+
+ my %attrs = HTML::SimpleParse->parse_args('A="xx" B=3');
+ # %attrs is now ('A' => 'xx', 'B' => '3')
+
+=head1 DESCRIPTION
+
+This module is a simple HTML parser.  It is similar in concept to HTML::Parser,
+but it differs from HTML::TreeBuilder in a couple of important ways.
+
+First, HTML::TreeBuilder knows which tags can contain other tags, which
+start tags have corresponding end tags, which tags can exist only in
+the <HEAD> portion of the document, and so forth.  HTML::SimpleParse
+does not know any of these things.  It just finds tags and text in the
+HTML you give it, it does not care about the specific content of these
+tags (though it does distiguish between different _types_ of tags,
+such as comments, starting tags like <b>, ending tags like </b>, and
+so on).
+
+Second, HTML::SimpleParse does not create a hierarchical tree of HTML content,
+but rather a simple linear list.  It does not pay any attention to balancing
+start tags with corresponding end tags, or which pairs of tags are inside other
+pairs of tags.
+
+Because of these characteristics, you can make a very effective HTML
+filter by sub-classing HTML::SimpleParse.  For example, to remove all comments 
+from HTML:
+
+ package NoComment;
+ use HTML::SimpleParse;
+ @ISA = qw(HTML::SimpleParse);
+ sub output_comment {}
+ 
+ package main;
+ NoComment->new($some_html)->output;
+
+Historically, I started the HTML::SimpleParse project in part because
+of a misunderstanding about HTML::Parser's functionality.  Many
+aspects of these two modules actually overlap.  I continue to maintain
+the HTML::SimpleParse module because people seem to be depending on
+it, and because beginners sometimes find HTML::SimpleParse to be
+simpler than HTML::Parser's more powerful interface.  People also seem
+to get a fair amount of usage out of the C<parse_args()> method
+directly.
+
+=head2 Methods
+
+=over 4
+
+=item * new
+
+ $p = new HTML::SimpleParse( $some_html );
+
+Creates a new HTML::SimpleParse object.  Optionally takes one argument,
+a string containing some HTML with which to initialize the object.  If
+you give it a non-empty string, the HTML will be parsed into a tree and 
+ready for outputting.
+
+Can also take a list of attributes, such as
+
+ $p = new HTML::SimpleParse( $some_html, 'fix_case' => -1);
+
+See the C<parse_args()> method below for an explanation of this attribute.
+
+=item * text
+
+ $text = $p->text;
+ $p->text( $new_text );
+
+Get or set the contents of the HTML to be parsed.
+
+=item * tree
+
+ foreach ($p->tree) { ... }
+
+Returns a list of all the nodes in the tree, in case you want to step
+through them manually or something.  Each node in the tree is an
+anonymous hash with (at least) three data members, $node->{type} (is
+this a comment, a start tag, an end tag, etc.), $node->{content} (all
+the text between the angle brackets, verbatim), and $node->{offset}
+(number of bytes from the beginning of the string).
+
+The possible values of $node->{type} are C<text>, C<starttag>,
+C<endtag>, C<ssi>, and C<markup>.
+
+=item * parse
+
+ $p->parse;
+
+Once an object has been initialized with some text, call $p->parse and 
+a tree will be created.  After the tree is created, you can call $p->output.
+If you feed some text to the new() method, parse will be called automatically
+during your object's construction.
+
+=item * parse_args
+
+ %hash = $p->parse_args( $arg_string );
+
+This routine is handy for parsing the contents of an HTML tag into key=value
+pairs.  For instance:
+
+  $text = 'type=checkbox checked name=flavor value="chocolate or strawberry"';
+  %hash = $p->parse_args( $text );
+  # %hash is ( TYPE=>'checkbox', CHECKED=>undef, NAME=>'flavor',
+  #            VALUE=>'chocolate or strawberry' )
+
+Note that the position of the last m//g search on the string (the value 
+returned by Perl's pos() function) will be altered by the parse_args function,
+so make sure you take that into account if (in the above example) you do
+C<$text =~ m/something/g>.
+
+The parse_args() method can be run as either an object method or as a
+class method, i.e. as either $p->parse_args(...) or
+HTML::SimpleParse->parse_args(...).
+
+HTML attribute lists are supposed to be case-insensitive with respect
+to attribute names.  To achieve this behavior, parse_args() respects
+the 'fix_case' flag, which can be set either as a package global
+$FIX_CASE, or as a class member datum 'fix_case'.  If set to 0, no
+case conversion is done.  If set to 1, all keys are converted to upper
+case.  If set to -1, all keys are converted to lower case.  The
+default is 1, i.e. all keys are uppercased.
+
+If an attribute takes no value (like "checked" in the above example) then it
+will still have an entry in the returned hash, but its value will be C<undef>.
+For example:
+
+  %hash = $p->parse_args('type=checkbox checked name=banana value=""');
+  # $hash{CHECKED} is undef, but $hash{VALUE} is ""
+
+This method actually returns a list (not a hash), so duplicate attributes and
+order will be preserved if you want them to be:
+
+ @hash = $p->parse_args("name=family value=gwen value=mom value=pop");
+ # @hash is qw(NAME family VALUE gwen VALUE mom VALUE pop)
+
+=item * output
+
+ $p->output;
+
+This will output the contents of the HTML, passing the real work off to
+the output_text, output_comment, etc. functions.  If you do not override any
+of these methods, this module will output the exact text that it parsed into
+a tree in the first place.
+
+=item * get_output
+
+ print $p->get_output
+
+Similar to $p->output(), but returns its result instead of printing it.
+
+=item * execute
+
+ foreach ($p->tree) {
+    print $p->execute($_);
+ }
+
+Executes a single node in the HTML parse tree.  Useful if you want to loop
+through the nodes and output them individually.
+
+=back
+
+The following methods do the actual outputting of the various parts of
+the HTML.  Override some of them if you want to change the way the HTML
+is output.  For instance, to strip comments from the HTML, override the
+output_comment method like so:
+
+ # In subclass:
+ sub output_comment { }  # Does nothing
+
+=over 4
+
+=item * output_text
+
+=item * output_comment
+
+=item * output_endtag
+
+=item * output_starttag
+
+=item * output_markup
+
+=item * output_ssi
+
+
+=back
+
+=head1 CAVEATS
+
+Please do not assume that the interface here is stable.  This is a first pass, 
+and I'm still trying to incorporate suggestions from the community.  If you
+employ this module somewhere, make doubly sure before upgrading that none of your
+code breaks when you use the newer version.
+
+
+=head1 BUGS
+
+=over 4
+
+=item * Embedded >s are broken
+
+Won't handle tags with embedded >s in them, like
+<input name=expr value="x > y">.  This will be fixed in a future
+version, probably by using the parse_args method.  Suggestions are welcome.
+
+=back
+
+=head1 TO DO
+
+=over 4
+
+=item * extensibility
+
+Based on a suggestion from Randy Harmon (thanks), I'd like to make it easier
+for subclasses of SimpleParse to pick out other kinds of HTML blocks, i.e.
+extend the set {text, comment, endtag, starttag, markup, ssi} to include more
+members.  Currently the only easy way to do that is by overriding the 
+C<parse> method:
+
+ sub parse {  # In subclass
+    my $self = $_[0];
+    $self->SUPER::parse(@_);
+    foreach ($self->tree) {
+       if ($_->{content} =~ m#^a\s+#i) {
+          $_->{type} = 'anchor_start';
+       }
+    }
+ }
+
+ sub output_anchor_start {
+    # Whatever you want...
+ }
+
+Alternatively, this feature might be implemented by hanging attatchments 
+onto the parsing loop, like this:
+
+ my $parser = new SimpleParse( $html_text );
+ $regex = '<(a\s+.*?)>';
+ $parser->watch_for( 'anchor_start', $regex );
+ 
+ sub SimpleParse::output_anchor_start {
+    # Whatever you want...
+ }
+
+I think I like that idea better.  If you wanted to, you could make a subclass
+with output_anchor_start as one of its methods, and put the ->watch_for 
+stuff in the constructor.
+
+
+=item * reading from filehandles
+
+It would be nice if you could initialize an object by giving it a filehandle
+or filename instead of the text itself.
+
+=item * tests
+
+I need to write a few tests that run under "make test".
+
+
+=back
+
+
+=head1 AUTHOR
+
+Ken Williams <ken@forum.swarthmore.edu>
+
+=head1 COPYRIGHT
+
+Copyright 1998 Swarthmore College.  All rights reserved.
+
+This library is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+=cut