604 lines
17 KiB
Perl
604 lines
17 KiB
Perl
# $Id$
|
|
|
|
package XML::SAX::PurePerl;
|
|
|
|
use strict;
|
|
use XML::SAX::PurePerl::Productions qw($SingleChar);
|
|
|
|
sub elementdecl {
|
|
my ($self, $reader) = @_;
|
|
|
|
my $data = $reader->data(9);
|
|
return 0 unless $data =~ /^<!ELEMENT/;
|
|
$reader->move_along(9);
|
|
|
|
$self->skip_whitespace($reader) ||
|
|
$self->parser_error("No whitespace after ELEMENT declaration", $reader);
|
|
|
|
my $name = $self->Name($reader);
|
|
|
|
$self->skip_whitespace($reader) ||
|
|
$self->parser_error("No whitespace after ELEMENT's name", $reader);
|
|
|
|
$self->contentspec($reader, $name);
|
|
|
|
$self->skip_whitespace($reader);
|
|
|
|
$reader->match('>') or $self->parser_error("Closing angle bracket not found on ELEMENT declaration", $reader);
|
|
|
|
return 1;
|
|
}
|
|
|
|
sub contentspec {
|
|
my ($self, $reader, $name) = @_;
|
|
|
|
my $data = $reader->data(5);
|
|
|
|
my $model;
|
|
if ($data =~ /^EMPTY/) {
|
|
$reader->move_along(5);
|
|
$model = 'EMPTY';
|
|
}
|
|
elsif ($data =~ /^ANY/) {
|
|
$reader->move_along(3);
|
|
$model = 'ANY';
|
|
}
|
|
else {
|
|
$model = $self->Mixed_or_children($reader);
|
|
}
|
|
|
|
if ($model) {
|
|
# call SAX callback now.
|
|
$self->element_decl({Name => $name, Model => $model});
|
|
return 1;
|
|
}
|
|
|
|
$self->parser_error("contentspec not found in ELEMENT declaration", $reader);
|
|
}
|
|
|
|
sub Mixed_or_children {
|
|
my ($self, $reader) = @_;
|
|
|
|
my $data = $reader->data(8);
|
|
$data =~ /^\(/ or return; # $self->parser_error("No opening bracket in Mixed or children", $reader);
|
|
|
|
if ($data =~ /^\(\s*\#PCDATA/) {
|
|
$reader->match('(');
|
|
$self->skip_whitespace($reader);
|
|
$reader->move_along(7);
|
|
my $model = $self->Mixed($reader);
|
|
return $model;
|
|
}
|
|
|
|
# not matched - must be Children
|
|
return $self->children($reader);
|
|
}
|
|
|
|
# Mixed ::= ( '(' S* PCDATA ( S* '|' S* QName )* S* ')' '*' )
|
|
# | ( '(' S* PCDATA S* ')' )
|
|
sub Mixed {
|
|
my ($self, $reader) = @_;
|
|
|
|
# Mixed_or_children already matched '(' S* '#PCDATA'
|
|
|
|
my $model = '(#PCDATA';
|
|
|
|
$self->skip_whitespace($reader);
|
|
|
|
my %seen;
|
|
|
|
while (1) {
|
|
last unless $reader->match('|');
|
|
$self->skip_whitespace($reader);
|
|
|
|
my $name = $self->Name($reader) ||
|
|
$self->parser_error("No 'Name' after Mixed content '|'", $reader);
|
|
|
|
if ($seen{$name}) {
|
|
$self->parser_error("Element '$name' has already appeared in this group", $reader);
|
|
}
|
|
$seen{$name}++;
|
|
|
|
$model .= "|$name";
|
|
|
|
$self->skip_whitespace($reader);
|
|
}
|
|
|
|
$reader->match(')') || $self->parser_error("no closing bracket on mixed content", $reader);
|
|
|
|
$model .= ")";
|
|
|
|
if ($reader->match('*')) {
|
|
$model .= "*";
|
|
}
|
|
|
|
return $model;
|
|
}
|
|
|
|
# [[47]] Children ::= ChoiceOrSeq Cardinality?
|
|
# [[48]] Cp ::= ( QName | ChoiceOrSeq ) Cardinality?
|
|
# ChoiceOrSeq ::= '(' S* Cp ( Choice | Seq )? S* ')'
|
|
# [[49]] Choice ::= ( S* '|' S* Cp )+
|
|
# [[50]] Seq ::= ( S* ',' S* Cp )+
|
|
# // Children ::= (Choice | Seq) Cardinality?
|
|
# // Cp ::= ( QName | Choice | Seq) Cardinality?
|
|
# // Choice ::= '(' S* Cp ( S* '|' S* Cp )+ S* ')'
|
|
# // Seq ::= '(' S* Cp ( S* ',' S* Cp )* S* ')'
|
|
# [[51]] Mixed ::= ( '(' S* PCDATA ( S* '|' S* QName )* S* ')' MixedCardinality )
|
|
# | ( '(' S* PCDATA S* ')' )
|
|
# Cardinality ::= '?' | '+' | '*'
|
|
# MixedCardinality ::= '*'
|
|
sub children {
|
|
my ($self, $reader) = @_;
|
|
|
|
return $self->ChoiceOrSeq($reader) . $self->Cardinality($reader);
|
|
}
|
|
|
|
sub ChoiceOrSeq {
|
|
my ($self, $reader) = @_;
|
|
|
|
$reader->match('(') or $self->parser_error("choice/seq contains no opening bracket", $reader);
|
|
|
|
my $model = '(';
|
|
|
|
$self->skip_whitespace($reader);
|
|
|
|
$model .= $self->Cp($reader);
|
|
|
|
if (my $choice = $self->Choice($reader)) {
|
|
$model .= $choice;
|
|
}
|
|
else {
|
|
$model .= $self->Seq($reader);
|
|
}
|
|
|
|
$self->skip_whitespace($reader);
|
|
|
|
$reader->match(')') or $self->parser_error("choice/seq contains no closing bracket", $reader);
|
|
|
|
$model .= ')';
|
|
|
|
return $model;
|
|
}
|
|
|
|
sub Cardinality {
|
|
my ($self, $reader) = @_;
|
|
# cardinality is always optional
|
|
my $data = $reader->data;
|
|
if ($data =~ /^([\?\+\*])/) {
|
|
$reader->move_along(1);
|
|
return $1;
|
|
}
|
|
return '';
|
|
}
|
|
|
|
sub Cp {
|
|
my ($self, $reader) = @_;
|
|
|
|
my $model;
|
|
my $name = eval
|
|
{
|
|
if (my $name = $self->Name($reader)) {
|
|
return $name . $self->Cardinality($reader);
|
|
}
|
|
};
|
|
return $name if defined $name;
|
|
return $self->ChoiceOrSeq($reader) . $self->Cardinality($reader);
|
|
}
|
|
|
|
sub Choice {
|
|
my ($self, $reader) = @_;
|
|
|
|
my $model = '';
|
|
$self->skip_whitespace($reader);
|
|
|
|
while ($reader->match('|')) {
|
|
$self->skip_whitespace($reader);
|
|
$model .= '|';
|
|
$model .= $self->Cp($reader);
|
|
$self->skip_whitespace($reader);
|
|
}
|
|
|
|
return $model;
|
|
}
|
|
|
|
sub Seq {
|
|
my ($self, $reader) = @_;
|
|
|
|
my $model = '';
|
|
$self->skip_whitespace($reader);
|
|
|
|
while ($reader->match(',')) {
|
|
$self->skip_whitespace($reader);
|
|
my $cp = $self->Cp($reader);
|
|
if ($cp) {
|
|
$model .= ',';
|
|
$model .= $cp;
|
|
}
|
|
$self->skip_whitespace($reader);
|
|
}
|
|
|
|
return $model;
|
|
}
|
|
|
|
sub AttlistDecl {
|
|
my ($self, $reader) = @_;
|
|
|
|
my $data = $reader->data(9);
|
|
if ($data =~ /^<!ATTLIST/) {
|
|
# It's an attlist
|
|
|
|
$reader->move_along(9);
|
|
|
|
$self->skip_whitespace($reader) ||
|
|
$self->parser_error("No whitespace after ATTLIST declaration", $reader);
|
|
my $name = $self->Name($reader);
|
|
|
|
$self->AttDefList($reader, $name);
|
|
|
|
$self->skip_whitespace($reader);
|
|
|
|
$reader->match('>') or $self->parser_error("Closing angle bracket not found on ATTLIST declaration", $reader);
|
|
|
|
return 1;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
sub AttDefList {
|
|
my ($self, $reader, $name) = @_;
|
|
|
|
1 while $self->AttDef($reader, $name);
|
|
}
|
|
|
|
sub AttDef {
|
|
my ($self, $reader, $el_name) = @_;
|
|
|
|
$self->skip_whitespace($reader) || return 0;
|
|
my $att_name = $self->Name($reader) || return 0;
|
|
$self->skip_whitespace($reader) ||
|
|
$self->parser_error("No whitespace after Name in attribute definition", $reader);
|
|
my $att_type = $self->AttType($reader);
|
|
|
|
$self->skip_whitespace($reader) ||
|
|
$self->parser_error("No whitespace after AttType in attribute definition", $reader);
|
|
my ($mode, $value) = $self->DefaultDecl($reader);
|
|
|
|
# fire SAX event here!
|
|
$self->attribute_decl({
|
|
eName => $el_name,
|
|
aName => $att_name,
|
|
Type => $att_type,
|
|
Mode => $mode,
|
|
Value => $value,
|
|
});
|
|
return 1;
|
|
}
|
|
|
|
sub AttType {
|
|
my ($self, $reader) = @_;
|
|
|
|
return $self->StringType($reader) ||
|
|
$self->TokenizedType($reader) ||
|
|
$self->EnumeratedType($reader) ||
|
|
$self->parser_error("Can't match AttType", $reader);
|
|
}
|
|
|
|
sub StringType {
|
|
my ($self, $reader) = @_;
|
|
|
|
my $data = $reader->data(5);
|
|
return unless $data =~ /^CDATA/;
|
|
$reader->move_along(5);
|
|
return 'CDATA';
|
|
}
|
|
|
|
sub TokenizedType {
|
|
my ($self, $reader) = @_;
|
|
|
|
my $data = $reader->data(8);
|
|
if ($data =~ /^(IDREFS?|ID|ENTITIES|ENTITY|NMTOKENS?)/) {
|
|
$reader->move_along(length($1));
|
|
return $1;
|
|
}
|
|
return;
|
|
}
|
|
|
|
sub EnumeratedType {
|
|
my ($self, $reader) = @_;
|
|
return $self->NotationType($reader) || $self->Enumeration($reader);
|
|
}
|
|
|
|
sub NotationType {
|
|
my ($self, $reader) = @_;
|
|
|
|
my $data = $reader->data(8);
|
|
return unless $data =~ /^NOTATION/;
|
|
$reader->move_along(8);
|
|
|
|
$self->skip_whitespace($reader) ||
|
|
$self->parser_error("No whitespace after NOTATION", $reader);
|
|
$reader->match('(') or $self->parser_error("No opening bracket in notation section", $reader);
|
|
|
|
$self->skip_whitespace($reader);
|
|
my $model = 'NOTATION (';
|
|
my $name = $self->Name($reader) ||
|
|
$self->parser_error("No name in notation section", $reader);
|
|
$model .= $name;
|
|
$self->skip_whitespace($reader);
|
|
$data = $reader->data;
|
|
while ($data =~ /^\|/) {
|
|
$reader->move_along(1);
|
|
$model .= '|';
|
|
$self->skip_whitespace($reader);
|
|
my $name = $self->Name($reader) ||
|
|
$self->parser_error("No name in notation section", $reader);
|
|
$model .= $name;
|
|
$self->skip_whitespace($reader);
|
|
$data = $reader->data;
|
|
}
|
|
$data =~ /^\)/ or $self->parser_error("No closing bracket in notation section", $reader);
|
|
$reader->move_along(1);
|
|
|
|
$model .= ')';
|
|
|
|
return $model;
|
|
}
|
|
|
|
sub Enumeration {
|
|
my ($self, $reader) = @_;
|
|
|
|
return unless $reader->match('(');
|
|
|
|
$self->skip_whitespace($reader);
|
|
my $model = '(';
|
|
my $nmtoken = $self->Nmtoken($reader) ||
|
|
$self->parser_error("No Nmtoken in enumerated declaration", $reader);
|
|
$model .= $nmtoken;
|
|
$self->skip_whitespace($reader);
|
|
my $data = $reader->data;
|
|
while ($data =~ /^\|/) {
|
|
$model .= '|';
|
|
$reader->move_along(1);
|
|
$self->skip_whitespace($reader);
|
|
my $nmtoken = $self->Nmtoken($reader) ||
|
|
$self->parser_error("No Nmtoken in enumerated declaration", $reader);
|
|
$model .= $nmtoken;
|
|
$self->skip_whitespace($reader);
|
|
$data = $reader->data;
|
|
}
|
|
$data =~ /^\)/ or $self->parser_error("No closing bracket in enumerated declaration", $reader);
|
|
$reader->move_along(1);
|
|
|
|
$model .= ')';
|
|
|
|
return $model;
|
|
}
|
|
|
|
sub Nmtoken {
|
|
my ($self, $reader) = @_;
|
|
return $self->Name($reader);
|
|
}
|
|
|
|
sub DefaultDecl {
|
|
my ($self, $reader) = @_;
|
|
|
|
my $data = $reader->data(9);
|
|
if ($data =~ /^(\#REQUIRED|\#IMPLIED)/) {
|
|
$reader->move_along(length($1));
|
|
return $1;
|
|
}
|
|
my $model = '';
|
|
if ($data =~ /^\#FIXED/) {
|
|
$reader->move_along(6);
|
|
$self->skip_whitespace($reader) || $self->parser_error(
|
|
"no whitespace after FIXED specifier", $reader);
|
|
my $value = $self->AttValue($reader);
|
|
return "#FIXED", $value;
|
|
}
|
|
my $value = $self->AttValue($reader);
|
|
return undef, $value;
|
|
}
|
|
|
|
sub EntityDecl {
|
|
my ($self, $reader) = @_;
|
|
|
|
my $data = $reader->data(8);
|
|
return 0 unless $data =~ /^<!ENTITY/;
|
|
$reader->move_along(8);
|
|
|
|
$self->skip_whitespace($reader) || $self->parser_error(
|
|
"No whitespace after ENTITY declaration", $reader);
|
|
|
|
$self->PEDecl($reader) || $self->GEDecl($reader);
|
|
|
|
$self->skip_whitespace($reader);
|
|
|
|
$reader->match('>') or $self->parser_error("No closing '>' in entity definition", $reader);
|
|
|
|
return 1;
|
|
}
|
|
|
|
sub GEDecl {
|
|
my ($self, $reader) = @_;
|
|
|
|
my $name = $self->Name($reader) || $self->parser_error("No entity name given", $reader);
|
|
$self->skip_whitespace($reader) || $self->parser_error("No whitespace after entity name", $reader);
|
|
|
|
# TODO: ExternalID calls lexhandler method. Wrong place for it.
|
|
my $value;
|
|
if ($value = $self->ExternalID($reader)) {
|
|
$value .= $self->NDataDecl($reader);
|
|
}
|
|
else {
|
|
$value = $self->EntityValue($reader);
|
|
}
|
|
|
|
if ($self->{ParseOptions}{entities}{$name}) {
|
|
warn("entity $name already exists\n");
|
|
} else {
|
|
$self->{ParseOptions}{entities}{$name} = 1;
|
|
$self->{ParseOptions}{expanded_entity}{$name} = $value; # ???
|
|
}
|
|
# do callback?
|
|
return 1;
|
|
}
|
|
|
|
sub PEDecl {
|
|
my ($self, $reader) = @_;
|
|
|
|
return 0 unless $reader->match('%');
|
|
|
|
$self->skip_whitespace($reader) || $self->parser_error("No whitespace after parameter entity marker", $reader);
|
|
my $name = $self->Name($reader) || $self->parser_error("No parameter entity name given", $reader);
|
|
$self->skip_whitespace($reader) || $self->parser_error("No whitespace after parameter entity name", $reader);
|
|
my $value = $self->ExternalID($reader) ||
|
|
$self->EntityValue($reader) ||
|
|
$self->parser_error("PE is not a value or an external resource", $reader);
|
|
# do callback?
|
|
return 1;
|
|
}
|
|
|
|
my $quotre = qr/[^%&\"]/;
|
|
my $aposre = qr/[^%&\']/;
|
|
|
|
sub EntityValue {
|
|
my ($self, $reader) = @_;
|
|
|
|
my $data = $reader->data;
|
|
my $quote = '"';
|
|
my $re = $quotre;
|
|
if ($data !~ /^"/) {
|
|
$data =~ /^'/ or $self->parser_error("Not a quote character", $reader);
|
|
$quote = "'";
|
|
$re = $aposre;
|
|
}
|
|
$reader->move_along(1);
|
|
|
|
my $value = '';
|
|
|
|
while (1) {
|
|
my $data = $reader->data;
|
|
|
|
$self->parser_error("EOF found while reading entity value", $reader)
|
|
unless length($data);
|
|
|
|
if ($data =~ /^($re+)/) {
|
|
my $match = $1;
|
|
$value .= $match;
|
|
$reader->move_along(length($match));
|
|
}
|
|
elsif ($reader->match('&')) {
|
|
# if it's a char ref, expand now:
|
|
if ($reader->match('#')) {
|
|
my $char;
|
|
my $ref = '';
|
|
if ($reader->match('x')) {
|
|
my $data = $reader->data;
|
|
while (1) {
|
|
$self->parser_error("EOF looking for reference end", $reader)
|
|
unless length($data);
|
|
if ($data !~ /^([0-9a-fA-F]*)/) {
|
|
last;
|
|
}
|
|
$ref .= $1;
|
|
$reader->move_along(length($1));
|
|
if (length($1) == length($data)) {
|
|
$data = $reader->data;
|
|
}
|
|
else {
|
|
last;
|
|
}
|
|
}
|
|
$char = chr_ref(hex($ref));
|
|
$ref = "x$ref";
|
|
}
|
|
else {
|
|
my $data = $reader->data;
|
|
while (1) {
|
|
$self->parser_error("EOF looking for reference end", $reader)
|
|
unless length($data);
|
|
if ($data !~ /^([0-9]*)/) {
|
|
last;
|
|
}
|
|
$ref .= $1;
|
|
$reader->move_along(length($1));
|
|
if (length($1) == length($data)) {
|
|
$data = $reader->data;
|
|
}
|
|
else {
|
|
last;
|
|
}
|
|
}
|
|
$char = chr($ref);
|
|
}
|
|
$reader->match(';') ||
|
|
$self->parser_error("No semi-colon found after character reference", $reader);
|
|
if ($char !~ $SingleChar) { # match a single character
|
|
$self->parser_error("Character reference '&#$ref;' refers to an illegal XML character ($char)", $reader);
|
|
}
|
|
$value .= $char;
|
|
}
|
|
else {
|
|
# entity refs in entities get expanded later, so don't parse now.
|
|
$value .= '&';
|
|
}
|
|
}
|
|
elsif ($reader->match('%')) {
|
|
$value .= $self->PEReference($reader);
|
|
}
|
|
elsif ($reader->match($quote)) {
|
|
# end of attrib
|
|
last;
|
|
}
|
|
else {
|
|
$self->parser_error("Invalid character in attribute value: " . substr($reader->data, 0, 1), $reader);
|
|
}
|
|
}
|
|
|
|
return $value;
|
|
}
|
|
|
|
sub NDataDecl {
|
|
my ($self, $reader) = @_;
|
|
$self->skip_whitespace($reader) || return '';
|
|
my $data = $reader->data(5);
|
|
return '' unless $data =~ /^NDATA/;
|
|
$reader->move_along(5);
|
|
$self->skip_whitespace($reader) || $self->parser_error("No whitespace after NDATA declaration", $reader);
|
|
my $name = $self->Name($reader) || $self->parser_error("NDATA declaration lacks a proper Name", $reader);
|
|
return " NDATA $name";
|
|
}
|
|
|
|
sub NotationDecl {
|
|
my ($self, $reader) = @_;
|
|
|
|
my $data = $reader->data(10);
|
|
return 0 unless $data =~ /^<!NOTATION/;
|
|
$reader->move_along(10);
|
|
$self->skip_whitespace($reader) ||
|
|
$self->parser_error("No whitespace after NOTATION declaration", $reader);
|
|
$data = $reader->data;
|
|
my $value = '';
|
|
while(1) {
|
|
$self->parser_error("EOF found while looking for end of NotationDecl", $reader)
|
|
unless length($data);
|
|
|
|
if ($data =~ /^([^>]*)>/) {
|
|
$value .= $1;
|
|
$reader->move_along(length($1) + 1);
|
|
$self->notation_decl({Name => "FIXME", SystemId => "FIXME", PublicId => "FIXME" });
|
|
last;
|
|
}
|
|
else {
|
|
$value .= $data;
|
|
$reader->move_along(length($data));
|
|
$data = $reader->data;
|
|
}
|
|
}
|
|
return 1;
|
|
}
|
|
|
|
1;
|