8986 lines
319 KiB
Perl
8986 lines
319 KiB
Perl
#####################################################################
|
||
#
|
||
# The Perl::Tidy::Tokenizer package is essentially a filter which
|
||
# reads lines of perl source code from a source object and provides
|
||
# corresponding tokenized lines through its get_line() method. Lines
|
||
# flow from the source_object to the caller like this:
|
||
#
|
||
# source_object --> LineBuffer_object --> Tokenizer --> calling routine
|
||
# get_line() get_line() get_line() line_of_tokens
|
||
#
|
||
# The source object can be any object with a get_line() method which
|
||
# supplies one line (a character string) perl call.
|
||
# The LineBuffer object is created by the Tokenizer.
|
||
# The Tokenizer returns a reference to a data structure 'line_of_tokens'
|
||
# containing one tokenized line for each call to its get_line() method.
|
||
#
|
||
# WARNING: This is not a real class. Only one tokenizer my be used.
|
||
#
|
||
########################################################################
|
||
|
||
package Perl::Tidy::Tokenizer;
|
||
use strict;
|
||
use warnings;
|
||
our $VERSION = '20210111';
|
||
|
||
use Perl::Tidy::LineBuffer;
|
||
use Carp;
|
||
|
||
# PACKAGE VARIABLES for processing an entire FILE.
|
||
# These must be package variables because most may get localized during
|
||
# processing. Most are initialized in sub prepare_for_a_new_file.
|
||
use vars qw{
|
||
$tokenizer_self
|
||
|
||
$last_nonblank_token
|
||
$last_nonblank_type
|
||
$last_nonblank_block_type
|
||
$statement_type
|
||
$in_attribute_list
|
||
$current_package
|
||
$context
|
||
|
||
%is_constant
|
||
%is_user_function
|
||
%user_function_prototype
|
||
%is_block_function
|
||
%is_block_list_function
|
||
%saw_function_definition
|
||
%saw_use_module
|
||
|
||
$brace_depth
|
||
$paren_depth
|
||
$square_bracket_depth
|
||
|
||
@current_depth
|
||
@total_depth
|
||
$total_depth
|
||
@nesting_sequence_number
|
||
@current_sequence_number
|
||
@paren_type
|
||
@paren_semicolon_count
|
||
@paren_structural_type
|
||
@brace_type
|
||
@brace_structural_type
|
||
@brace_context
|
||
@brace_package
|
||
@square_bracket_type
|
||
@square_bracket_structural_type
|
||
@depth_array
|
||
@nested_ternary_flag
|
||
@nested_statement_type
|
||
@starting_line_of_current_depth
|
||
};
|
||
|
||
# GLOBAL CONSTANTS for routines in this package,
|
||
# Initialized in a BEGIN block.
|
||
use vars qw{
|
||
%is_indirect_object_taker
|
||
%is_block_operator
|
||
%expecting_operator_token
|
||
%expecting_operator_types
|
||
%expecting_term_types
|
||
%expecting_term_token
|
||
%is_digraph
|
||
%is_file_test_operator
|
||
%is_trigraph
|
||
%is_tetragraph
|
||
%is_valid_token_type
|
||
%is_keyword
|
||
%is_code_block_token
|
||
%really_want_term
|
||
@opening_brace_names
|
||
@closing_brace_names
|
||
%is_keyword_taking_list
|
||
%is_keyword_taking_optional_arg
|
||
%is_keyword_rejecting_slash_as_pattern_delimiter
|
||
%is_keyword_rejecting_question_as_pattern_delimiter
|
||
%is_q_qq_qw_qx_qr_s_y_tr_m
|
||
%is_sub
|
||
%is_package
|
||
%is_comma_question_colon
|
||
%other_line_endings
|
||
};
|
||
|
||
# possible values of operator_expected()
|
||
use constant TERM => -1;
|
||
use constant UNKNOWN => 0;
|
||
use constant OPERATOR => 1;
|
||
|
||
# possible values of context
|
||
use constant SCALAR_CONTEXT => -1;
|
||
use constant UNKNOWN_CONTEXT => 0;
|
||
use constant LIST_CONTEXT => 1;
|
||
|
||
# Maximum number of little messages; probably need not be changed.
|
||
use constant MAX_NAG_MESSAGES => 6;
|
||
|
||
BEGIN {
|
||
|
||
# Array index names for $self
|
||
my $i = 0;
|
||
use constant {
|
||
_rhere_target_list_ => $i++,
|
||
_in_here_doc_ => $i++,
|
||
_here_doc_target_ => $i++,
|
||
_here_quote_character_ => $i++,
|
||
_in_data_ => $i++,
|
||
_in_end_ => $i++,
|
||
_in_format_ => $i++,
|
||
_in_error_ => $i++,
|
||
_in_pod_ => $i++,
|
||
_in_attribute_list_ => $i++,
|
||
_in_quote_ => $i++,
|
||
_quote_target_ => $i++,
|
||
_line_start_quote_ => $i++,
|
||
_starting_level_ => $i++,
|
||
_know_starting_level_ => $i++,
|
||
_tabsize_ => $i++,
|
||
_indent_columns_ => $i++,
|
||
_look_for_hash_bang_ => $i++,
|
||
_trim_qw_ => $i++,
|
||
_continuation_indentation_ => $i++,
|
||
_outdent_labels_ => $i++,
|
||
_last_line_number_ => $i++,
|
||
_saw_perl_dash_P_ => $i++,
|
||
_saw_perl_dash_w_ => $i++,
|
||
_saw_use_strict_ => $i++,
|
||
_saw_v_string_ => $i++,
|
||
_hit_bug_ => $i++,
|
||
_look_for_autoloader_ => $i++,
|
||
_look_for_selfloader_ => $i++,
|
||
_saw_autoloader_ => $i++,
|
||
_saw_selfloader_ => $i++,
|
||
_saw_hash_bang_ => $i++,
|
||
_saw_end_ => $i++,
|
||
_saw_data_ => $i++,
|
||
_saw_negative_indentation_ => $i++,
|
||
_started_tokenizing_ => $i++,
|
||
_line_buffer_object_ => $i++,
|
||
_debugger_object_ => $i++,
|
||
_diagnostics_object_ => $i++,
|
||
_logger_object_ => $i++,
|
||
_unexpected_error_count_ => $i++,
|
||
_started_looking_for_here_target_at_ => $i++,
|
||
_nearly_matched_here_target_at_ => $i++,
|
||
_line_of_text_ => $i++,
|
||
_rlower_case_labels_at_ => $i++,
|
||
_extended_syntax_ => $i++,
|
||
_maximum_level_ => $i++,
|
||
_true_brace_error_count_ => $i++,
|
||
_rOpts_maximum_level_errors_ => $i++,
|
||
_rOpts_maximum_unexpected_errors_ => $i++,
|
||
_rOpts_logfile_ => $i++,
|
||
_rOpts_ => $i++,
|
||
};
|
||
}
|
||
|
||
{ ## closure for subs to count instances
|
||
|
||
# methods to count instances
|
||
my $_count = 0;
|
||
sub get_count { return $_count; }
|
||
sub _increment_count { return ++$_count }
|
||
sub _decrement_count { return --$_count }
|
||
}
|
||
|
||
sub DESTROY {
|
||
my $self = shift;
|
||
$self->_decrement_count();
|
||
return;
|
||
}
|
||
|
||
sub AUTOLOAD {
|
||
|
||
# Catch any undefined sub calls so that we are sure to get
|
||
# some diagnostic information. This sub should never be called
|
||
# except for a programming error.
|
||
our $AUTOLOAD;
|
||
return if ( $AUTOLOAD =~ /\bDESTROY$/ );
|
||
my ( $pkg, $fname, $lno ) = caller();
|
||
my $my_package = __PACKAGE__;
|
||
print STDERR <<EOM;
|
||
======================================================================
|
||
Error detected in package '$my_package', version $VERSION
|
||
Received unexpected AUTOLOAD call for sub '$AUTOLOAD'
|
||
Called from package: '$pkg'
|
||
Called from File '$fname' at line '$lno'
|
||
This error is probably due to a recent programming change
|
||
======================================================================
|
||
EOM
|
||
exit 1;
|
||
}
|
||
|
||
sub check_options {
|
||
|
||
# Check Tokenizer parameters
|
||
my $rOpts = shift;
|
||
|
||
%is_sub = ();
|
||
$is_sub{'sub'} = 1;
|
||
|
||
# Install any aliases to 'sub'
|
||
if ( $rOpts->{'sub-alias-list'} ) {
|
||
|
||
# Note that any 'sub-alias-list' has been preprocessed to
|
||
# be a trimmed, space-separated list which includes 'sub'
|
||
# for example, it might be 'sub method fun'
|
||
my @sub_alias_list = split /\s+/, $rOpts->{'sub-alias-list'};
|
||
foreach my $word (@sub_alias_list) {
|
||
$is_sub{$word} = 1;
|
||
}
|
||
}
|
||
return;
|
||
}
|
||
|
||
sub new {
|
||
|
||
my ( $class, @args ) = @_;
|
||
|
||
# Note: 'tabs' and 'indent_columns' are temporary and should be
|
||
# removed asap
|
||
my %defaults = (
|
||
source_object => undef,
|
||
debugger_object => undef,
|
||
diagnostics_object => undef,
|
||
logger_object => undef,
|
||
starting_level => undef,
|
||
indent_columns => 4,
|
||
tabsize => 8,
|
||
look_for_hash_bang => 0,
|
||
trim_qw => 1,
|
||
look_for_autoloader => 1,
|
||
look_for_selfloader => 1,
|
||
starting_line_number => 1,
|
||
extended_syntax => 0,
|
||
rOpts => {},
|
||
);
|
||
my %args = ( %defaults, @args );
|
||
|
||
# we are given an object with a get_line() method to supply source lines
|
||
my $source_object = $args{source_object};
|
||
my $rOpts = $args{rOpts};
|
||
|
||
# we create another object with a get_line() and peek_ahead() method
|
||
my $line_buffer_object = Perl::Tidy::LineBuffer->new($source_object);
|
||
|
||
# Tokenizer state data is as follows:
|
||
# _rhere_target_list_ reference to list of here-doc targets
|
||
# _here_doc_target_ the target string for a here document
|
||
# _here_quote_character_ the type of here-doc quoting (" ' ` or none)
|
||
# to determine if interpolation is done
|
||
# _quote_target_ character we seek if chasing a quote
|
||
# _line_start_quote_ line where we started looking for a long quote
|
||
# _in_here_doc_ flag indicating if we are in a here-doc
|
||
# _in_pod_ flag set if we are in pod documentation
|
||
# _in_error_ flag set if we saw severe error (binary in script)
|
||
# _in_data_ flag set if we are in __DATA__ section
|
||
# _in_end_ flag set if we are in __END__ section
|
||
# _in_format_ flag set if we are in a format description
|
||
# _in_attribute_list_ flag telling if we are looking for attributes
|
||
# _in_quote_ flag telling if we are chasing a quote
|
||
# _starting_level_ indentation level of first line
|
||
# _line_buffer_object_ object with get_line() method to supply source code
|
||
# _diagnostics_object_ place to write debugging information
|
||
# _unexpected_error_count_ error count used to limit output
|
||
# _lower_case_labels_at_ line numbers where lower case labels seen
|
||
# _hit_bug_ program bug detected
|
||
|
||
my $self = [];
|
||
$self->[_rhere_target_list_] = [];
|
||
$self->[_in_here_doc_] = 0;
|
||
$self->[_here_doc_target_] = "";
|
||
$self->[_here_quote_character_] = "";
|
||
$self->[_in_data_] = 0;
|
||
$self->[_in_end_] = 0;
|
||
$self->[_in_format_] = 0;
|
||
$self->[_in_error_] = 0;
|
||
$self->[_in_pod_] = 0;
|
||
$self->[_in_attribute_list_] = 0;
|
||
$self->[_in_quote_] = 0;
|
||
$self->[_quote_target_] = "";
|
||
$self->[_line_start_quote_] = -1;
|
||
$self->[_starting_level_] = $args{starting_level};
|
||
$self->[_know_starting_level_] = defined( $args{starting_level} );
|
||
$self->[_tabsize_] = $args{tabsize};
|
||
$self->[_indent_columns_] = $args{indent_columns};
|
||
$self->[_look_for_hash_bang_] = $args{look_for_hash_bang};
|
||
$self->[_trim_qw_] = $args{trim_qw};
|
||
$self->[_continuation_indentation_] = $args{continuation_indentation};
|
||
$self->[_outdent_labels_] = $args{outdent_labels};
|
||
$self->[_last_line_number_] = $args{starting_line_number} - 1;
|
||
$self->[_saw_perl_dash_P_] = 0;
|
||
$self->[_saw_perl_dash_w_] = 0;
|
||
$self->[_saw_use_strict_] = 0;
|
||
$self->[_saw_v_string_] = 0;
|
||
$self->[_hit_bug_] = 0;
|
||
$self->[_look_for_autoloader_] = $args{look_for_autoloader};
|
||
$self->[_look_for_selfloader_] = $args{look_for_selfloader};
|
||
$self->[_saw_autoloader_] = 0;
|
||
$self->[_saw_selfloader_] = 0;
|
||
$self->[_saw_hash_bang_] = 0;
|
||
$self->[_saw_end_] = 0;
|
||
$self->[_saw_data_] = 0;
|
||
$self->[_saw_negative_indentation_] = 0;
|
||
$self->[_started_tokenizing_] = 0;
|
||
$self->[_line_buffer_object_] = $line_buffer_object;
|
||
$self->[_debugger_object_] = $args{debugger_object};
|
||
$self->[_diagnostics_object_] = $args{diagnostics_object};
|
||
$self->[_logger_object_] = $args{logger_object};
|
||
$self->[_unexpected_error_count_] = 0;
|
||
$self->[_started_looking_for_here_target_at_] = 0;
|
||
$self->[_nearly_matched_here_target_at_] = undef;
|
||
$self->[_line_of_text_] = "";
|
||
$self->[_rlower_case_labels_at_] = undef;
|
||
$self->[_extended_syntax_] = $args{extended_syntax};
|
||
$self->[_maximum_level_] = 0;
|
||
$self->[_true_brace_error_count_] = 0;
|
||
$self->[_rOpts_maximum_level_errors_] = $rOpts->{'maximum-level-errors'};
|
||
$self->[_rOpts_maximum_unexpected_errors_] =
|
||
$rOpts->{'maximum-unexpected-errors'};
|
||
$self->[_rOpts_logfile_] = $rOpts->{'logfile'};
|
||
$self->[_rOpts_] = $rOpts;
|
||
bless $self, $class;
|
||
|
||
$tokenizer_self = $self;
|
||
|
||
prepare_for_a_new_file();
|
||
find_starting_indentation_level();
|
||
|
||
# This is not a full class yet, so die if an attempt is made to
|
||
# create more than one object.
|
||
|
||
if ( _increment_count() > 1 ) {
|
||
confess
|
||
"Attempt to create more than 1 object in $class, which is not a true class yet\n";
|
||
}
|
||
|
||
return $self;
|
||
|
||
}
|
||
|
||
# interface to Perl::Tidy::Logger routines
|
||
sub warning {
|
||
my $msg = shift;
|
||
my $logger_object = $tokenizer_self->[_logger_object_];
|
||
if ($logger_object) {
|
||
$logger_object->warning($msg);
|
||
}
|
||
return;
|
||
}
|
||
|
||
sub complain {
|
||
my $msg = shift;
|
||
my $logger_object = $tokenizer_self->[_logger_object_];
|
||
if ($logger_object) {
|
||
$logger_object->complain($msg);
|
||
}
|
||
return;
|
||
}
|
||
|
||
sub write_logfile_entry {
|
||
my $msg = shift;
|
||
my $logger_object = $tokenizer_self->[_logger_object_];
|
||
if ($logger_object) {
|
||
$logger_object->write_logfile_entry($msg);
|
||
}
|
||
return;
|
||
}
|
||
|
||
sub interrupt_logfile {
|
||
my $logger_object = $tokenizer_self->[_logger_object_];
|
||
if ($logger_object) {
|
||
$logger_object->interrupt_logfile();
|
||
}
|
||
return;
|
||
}
|
||
|
||
sub resume_logfile {
|
||
my $logger_object = $tokenizer_self->[_logger_object_];
|
||
if ($logger_object) {
|
||
$logger_object->resume_logfile();
|
||
}
|
||
return;
|
||
}
|
||
|
||
sub increment_brace_error {
|
||
my $logger_object = $tokenizer_self->[_logger_object_];
|
||
if ($logger_object) {
|
||
$logger_object->increment_brace_error();
|
||
}
|
||
return;
|
||
}
|
||
|
||
sub report_definite_bug {
|
||
$tokenizer_self->[_hit_bug_] = 1;
|
||
my $logger_object = $tokenizer_self->[_logger_object_];
|
||
if ($logger_object) {
|
||
$logger_object->report_definite_bug();
|
||
}
|
||
return;
|
||
}
|
||
|
||
sub brace_warning {
|
||
my $msg = shift;
|
||
my $logger_object = $tokenizer_self->[_logger_object_];
|
||
if ($logger_object) {
|
||
$logger_object->brace_warning($msg);
|
||
}
|
||
return;
|
||
}
|
||
|
||
sub get_saw_brace_error {
|
||
my $logger_object = $tokenizer_self->[_logger_object_];
|
||
if ($logger_object) {
|
||
return $logger_object->get_saw_brace_error();
|
||
}
|
||
else {
|
||
return 0;
|
||
}
|
||
}
|
||
|
||
sub get_unexpected_error_count {
|
||
my ($self) = @_;
|
||
return $self->[_unexpected_error_count_];
|
||
}
|
||
|
||
# interface to Perl::Tidy::Diagnostics routines
|
||
sub write_diagnostics {
|
||
my $msg = shift;
|
||
if ( $tokenizer_self->[_diagnostics_object_] ) {
|
||
$tokenizer_self->[_diagnostics_object_]->write_diagnostics($msg);
|
||
}
|
||
return;
|
||
}
|
||
|
||
sub get_maximum_level {
|
||
return $tokenizer_self->[_maximum_level_];
|
||
}
|
||
|
||
sub report_tokenization_errors {
|
||
|
||
my ($self) = @_;
|
||
|
||
# Report any tokenization errors and return a flag '$severe_error'.
|
||
# Set $severe_error = 1 if the tokenizations errors are so severe that
|
||
# the formatter should not attempt to format the file. Instead, it will
|
||
# just output the file verbatim.
|
||
|
||
# set severe error flag if tokenizer has encountered file reading problems
|
||
# (i.e. unexpected binary characters)
|
||
my $severe_error = $self->[_in_error_];
|
||
|
||
my $maxle = $self->[_rOpts_maximum_level_errors_];
|
||
my $maxue = $self->[_rOpts_maximum_unexpected_errors_];
|
||
$maxle = 1 unless defined($maxle);
|
||
$maxue = 0 unless defined($maxue);
|
||
|
||
my $level = get_indentation_level();
|
||
if ( $level != $tokenizer_self->[_starting_level_] ) {
|
||
warning("final indentation level: $level\n");
|
||
my $level_diff = $tokenizer_self->[_starting_level_] - $level;
|
||
if ( $level_diff < 0 ) { $level_diff = -$level_diff }
|
||
|
||
# Set severe error flag if the level error is greater than 1.
|
||
# The formatter can function for any level error but it is probably
|
||
# best not to attempt formatting for a high level error.
|
||
if ( $maxle >= 0 && $level_diff > $maxle ) {
|
||
$severe_error = 1;
|
||
warning(<<EOM);
|
||
Formatting will be skipped because level error '$level_diff' exceeds -maxle=$maxle; use -maxle=-1 to force formatting
|
||
EOM
|
||
}
|
||
}
|
||
|
||
check_final_nesting_depths();
|
||
|
||
# Likewise, large numbers of brace errors usually indicate non-perl
|
||
# scirpts, so set the severe error flag at a low number. This is similar
|
||
# to the level check, but different because braces may balance but be
|
||
# incorrectly interlaced.
|
||
if ( $tokenizer_self->[_true_brace_error_count_] > 2 ) {
|
||
$severe_error = 1;
|
||
}
|
||
|
||
if ( $tokenizer_self->[_look_for_hash_bang_]
|
||
&& !$tokenizer_self->[_saw_hash_bang_] )
|
||
{
|
||
warning(
|
||
"hit EOF without seeing hash-bang line; maybe don't need -x?\n");
|
||
}
|
||
|
||
if ( $tokenizer_self->[_in_format_] ) {
|
||
warning("hit EOF while in format description\n");
|
||
}
|
||
|
||
if ( $tokenizer_self->[_in_pod_] ) {
|
||
|
||
# Just write log entry if this is after __END__ or __DATA__
|
||
# because this happens to often, and it is not likely to be
|
||
# a parsing error.
|
||
if ( $tokenizer_self->[_saw_data_] || $tokenizer_self->[_saw_end_] ) {
|
||
write_logfile_entry(
|
||
"hit eof while in pod documentation (no =cut seen)\n\tthis can cause trouble with some pod utilities\n"
|
||
);
|
||
}
|
||
|
||
else {
|
||
complain(
|
||
"hit eof while in pod documentation (no =cut seen)\n\tthis can cause trouble with some pod utilities\n"
|
||
);
|
||
}
|
||
|
||
}
|
||
|
||
if ( $tokenizer_self->[_in_here_doc_] ) {
|
||
$severe_error = 1;
|
||
my $here_doc_target = $tokenizer_self->[_here_doc_target_];
|
||
my $started_looking_for_here_target_at =
|
||
$tokenizer_self->[_started_looking_for_here_target_at_];
|
||
if ($here_doc_target) {
|
||
warning(
|
||
"hit EOF in here document starting at line $started_looking_for_here_target_at with target: $here_doc_target\n"
|
||
);
|
||
}
|
||
else {
|
||
warning(
|
||
"hit EOF in here document starting at line $started_looking_for_here_target_at with empty target string\n"
|
||
);
|
||
}
|
||
my $nearly_matched_here_target_at =
|
||
$tokenizer_self->[_nearly_matched_here_target_at_];
|
||
if ($nearly_matched_here_target_at) {
|
||
warning(
|
||
"NOTE: almost matched at input line $nearly_matched_here_target_at except for whitespace\n"
|
||
);
|
||
}
|
||
}
|
||
|
||
# Something is seriously wrong if we ended inside a quote
|
||
if ( $tokenizer_self->[_in_quote_] ) {
|
||
$severe_error = 1;
|
||
my $line_start_quote = $tokenizer_self->[_line_start_quote_];
|
||
my $quote_target = $tokenizer_self->[_quote_target_];
|
||
my $what =
|
||
( $tokenizer_self->[_in_attribute_list_] )
|
||
? "attribute list"
|
||
: "quote/pattern";
|
||
warning(
|
||
"hit EOF seeking end of $what starting at line $line_start_quote ending in $quote_target\n"
|
||
);
|
||
}
|
||
|
||
if ( $tokenizer_self->[_hit_bug_] ) {
|
||
$severe_error = 1;
|
||
}
|
||
|
||
# Multiple "unexpected" type tokenization errors usually indicate parsing
|
||
# non-perl scripts, or that something is seriously wrong, so we should
|
||
# avoid formatting them. This can happen for example if we run perltidy on
|
||
# a shell script or an html file. But unfortunately this check can
|
||
# interfere with some extended syntaxes, such as RPerl, so it has to be off
|
||
# by default.
|
||
my $ue_count = $tokenizer_self->[_unexpected_error_count_];
|
||
if ( $maxue > 0 && $ue_count > $maxue ) {
|
||
warning(<<EOM);
|
||
Formatting will be skipped since unexpected token count = $ue_count > -maxue=$maxue; use -maxue=0 to force formatting
|
||
EOM
|
||
$severe_error = 1;
|
||
}
|
||
|
||
unless ( $tokenizer_self->[_saw_perl_dash_w_] ) {
|
||
if ( $] < 5.006 ) {
|
||
write_logfile_entry("Suggest including '-w parameter'\n");
|
||
}
|
||
else {
|
||
write_logfile_entry("Suggest including 'use warnings;'\n");
|
||
}
|
||
}
|
||
|
||
if ( $tokenizer_self->[_saw_perl_dash_P_] ) {
|
||
write_logfile_entry("Use of -P parameter for defines is discouraged\n");
|
||
}
|
||
|
||
unless ( $tokenizer_self->[_saw_use_strict_] ) {
|
||
write_logfile_entry("Suggest including 'use strict;'\n");
|
||
}
|
||
|
||
# it is suggested that labels have at least one upper case character
|
||
# for legibility and to avoid code breakage as new keywords are introduced
|
||
if ( $tokenizer_self->[_rlower_case_labels_at_] ) {
|
||
my @lower_case_labels_at =
|
||
@{ $tokenizer_self->[_rlower_case_labels_at_] };
|
||
write_logfile_entry(
|
||
"Suggest using upper case characters in label(s)\n");
|
||
local $" = ')(';
|
||
write_logfile_entry(" defined at line(s): (@lower_case_labels_at)\n");
|
||
}
|
||
return $severe_error;
|
||
}
|
||
|
||
sub report_v_string {
|
||
|
||
# warn if this version can't handle v-strings
|
||
my $tok = shift;
|
||
unless ( $tokenizer_self->[_saw_v_string_] ) {
|
||
$tokenizer_self->[_saw_v_string_] =
|
||
$tokenizer_self->[_last_line_number_];
|
||
}
|
||
if ( $] < 5.006 ) {
|
||
warning(
|
||
"Found v-string '$tok' but v-strings are not implemented in your version of perl; see Camel 3 book ch 2\n"
|
||
);
|
||
}
|
||
return;
|
||
}
|
||
|
||
sub get_input_line_number {
|
||
return $tokenizer_self->[_last_line_number_];
|
||
}
|
||
|
||
# returns the next tokenized line
|
||
sub get_line {
|
||
|
||
my $self = shift;
|
||
|
||
# USES GLOBAL VARIABLES: $tokenizer_self, $brace_depth,
|
||
# $square_bracket_depth, $paren_depth
|
||
|
||
my $input_line = $tokenizer_self->[_line_buffer_object_]->get_line();
|
||
$tokenizer_self->[_line_of_text_] = $input_line;
|
||
|
||
return unless ($input_line);
|
||
|
||
my $input_line_number = ++$tokenizer_self->[_last_line_number_];
|
||
|
||
# Find and remove what characters terminate this line, including any
|
||
# control r
|
||
my $input_line_separator = "";
|
||
if ( chomp($input_line) ) { $input_line_separator = $/ }
|
||
|
||
# The first test here very significantly speeds things up, but be sure to
|
||
# keep the regex and hash %other_line_endings the same.
|
||
if ( $other_line_endings{ substr( $input_line, -1 ) } ) {
|
||
if ( $input_line =~ s/((\r|\035|\032)+)$// ) {
|
||
$input_line_separator = $2 . $input_line_separator;
|
||
}
|
||
}
|
||
|
||
# for backwards compatibility we keep the line text terminated with
|
||
# a newline character
|
||
$input_line .= "\n";
|
||
$tokenizer_self->[_line_of_text_] = $input_line; # update
|
||
|
||
# create a data structure describing this line which will be
|
||
# returned to the caller.
|
||
|
||
# _line_type codes are:
|
||
# SYSTEM - system-specific code before hash-bang line
|
||
# CODE - line of perl code (including comments)
|
||
# POD_START - line starting pod, such as '=head'
|
||
# POD - pod documentation text
|
||
# POD_END - last line of pod section, '=cut'
|
||
# HERE - text of here-document
|
||
# HERE_END - last line of here-doc (target word)
|
||
# FORMAT - format section
|
||
# FORMAT_END - last line of format section, '.'
|
||
# DATA_START - __DATA__ line
|
||
# DATA - unidentified text following __DATA__
|
||
# END_START - __END__ line
|
||
# END - unidentified text following __END__
|
||
# ERROR - we are in big trouble, probably not a perl script
|
||
|
||
# Other variables:
|
||
# _curly_brace_depth - depth of curly braces at start of line
|
||
# _square_bracket_depth - depth of square brackets at start of line
|
||
# _paren_depth - depth of parens at start of line
|
||
# _starting_in_quote - this line continues a multi-line quote
|
||
# (so don't trim leading blanks!)
|
||
# _ending_in_quote - this line ends in a multi-line quote
|
||
# (so don't trim trailing blanks!)
|
||
my $line_of_tokens = {
|
||
_line_type => 'EOF',
|
||
_line_text => $input_line,
|
||
_line_number => $input_line_number,
|
||
_guessed_indentation_level => 0,
|
||
_curly_brace_depth => $brace_depth,
|
||
_square_bracket_depth => $square_bracket_depth,
|
||
_paren_depth => $paren_depth,
|
||
_quote_character => '',
|
||
## _rtoken_type => undef,
|
||
## _rtokens => undef,
|
||
## _rlevels => undef,
|
||
## _rslevels => undef,
|
||
## _rblock_type => undef,
|
||
## _rcontainer_type => undef,
|
||
## _rcontainer_environment => undef,
|
||
## _rtype_sequence => undef,
|
||
## _rnesting_tokens => undef,
|
||
## _rci_levels => undef,
|
||
## _rnesting_blocks => undef,
|
||
## _starting_in_quote => 0,
|
||
## _ending_in_quote => 0,
|
||
};
|
||
|
||
# must print line unchanged if we are in a here document
|
||
if ( $tokenizer_self->[_in_here_doc_] ) {
|
||
|
||
$line_of_tokens->{_line_type} = 'HERE';
|
||
my $here_doc_target = $tokenizer_self->[_here_doc_target_];
|
||
my $here_quote_character = $tokenizer_self->[_here_quote_character_];
|
||
my $candidate_target = $input_line;
|
||
chomp $candidate_target;
|
||
|
||
# Handle <<~ targets, which are indicated here by a leading space on
|
||
# the here quote character
|
||
if ( $here_quote_character =~ /^\s/ ) {
|
||
$candidate_target =~ s/^\s*//;
|
||
}
|
||
if ( $candidate_target eq $here_doc_target ) {
|
||
$tokenizer_self->[_nearly_matched_here_target_at_] = undef;
|
||
$line_of_tokens->{_line_type} = 'HERE_END';
|
||
write_logfile_entry("Exiting HERE document $here_doc_target\n");
|
||
|
||
my $rhere_target_list = $tokenizer_self->[_rhere_target_list_];
|
||
if ( @{$rhere_target_list} ) { # there can be multiple here targets
|
||
( $here_doc_target, $here_quote_character ) =
|
||
@{ shift @{$rhere_target_list} };
|
||
$tokenizer_self->[_here_doc_target_] = $here_doc_target;
|
||
$tokenizer_self->[_here_quote_character_] =
|
||
$here_quote_character;
|
||
write_logfile_entry(
|
||
"Entering HERE document $here_doc_target\n");
|
||
$tokenizer_self->[_nearly_matched_here_target_at_] = undef;
|
||
$tokenizer_self->[_started_looking_for_here_target_at_] =
|
||
$input_line_number;
|
||
}
|
||
else {
|
||
$tokenizer_self->[_in_here_doc_] = 0;
|
||
$tokenizer_self->[_here_doc_target_] = "";
|
||
$tokenizer_self->[_here_quote_character_] = "";
|
||
}
|
||
}
|
||
|
||
# check for error of extra whitespace
|
||
# note for PERL6: leading whitespace is allowed
|
||
else {
|
||
$candidate_target =~ s/\s*$//;
|
||
$candidate_target =~ s/^\s*//;
|
||
if ( $candidate_target eq $here_doc_target ) {
|
||
$tokenizer_self->[_nearly_matched_here_target_at_] =
|
||
$input_line_number;
|
||
}
|
||
}
|
||
return $line_of_tokens;
|
||
}
|
||
|
||
# must print line unchanged if we are in a format section
|
||
elsif ( $tokenizer_self->[_in_format_] ) {
|
||
|
||
if ( $input_line =~ /^\.[\s#]*$/ ) {
|
||
write_logfile_entry("Exiting format section\n");
|
||
$tokenizer_self->[_in_format_] = 0;
|
||
$line_of_tokens->{_line_type} = 'FORMAT_END';
|
||
}
|
||
else {
|
||
$line_of_tokens->{_line_type} = 'FORMAT';
|
||
}
|
||
return $line_of_tokens;
|
||
}
|
||
|
||
# must print line unchanged if we are in pod documentation
|
||
elsif ( $tokenizer_self->[_in_pod_] ) {
|
||
|
||
$line_of_tokens->{_line_type} = 'POD';
|
||
if ( $input_line =~ /^=cut/ ) {
|
||
$line_of_tokens->{_line_type} = 'POD_END';
|
||
write_logfile_entry("Exiting POD section\n");
|
||
$tokenizer_self->[_in_pod_] = 0;
|
||
}
|
||
if ( $input_line =~ /^\#\!.*perl\b/ && !$tokenizer_self->[_in_end_] ) {
|
||
warning(
|
||
"Hash-bang in pod can cause older versions of perl to fail! \n"
|
||
);
|
||
}
|
||
|
||
return $line_of_tokens;
|
||
}
|
||
|
||
# must print line unchanged if we have seen a severe error (i.e., we
|
||
# are seeing illegal tokens and cannot continue. Syntax errors do
|
||
# not pass this route). Calling routine can decide what to do, but
|
||
# the default can be to just pass all lines as if they were after __END__
|
||
elsif ( $tokenizer_self->[_in_error_] ) {
|
||
$line_of_tokens->{_line_type} = 'ERROR';
|
||
return $line_of_tokens;
|
||
}
|
||
|
||
# print line unchanged if we are __DATA__ section
|
||
elsif ( $tokenizer_self->[_in_data_] ) {
|
||
|
||
# ...but look for POD
|
||
# Note that the _in_data and _in_end flags remain set
|
||
# so that we return to that state after seeing the
|
||
# end of a pod section
|
||
if ( $input_line =~ /^=(\w+)\b/ && $1 ne 'cut' ) {
|
||
$line_of_tokens->{_line_type} = 'POD_START';
|
||
write_logfile_entry("Entering POD section\n");
|
||
$tokenizer_self->[_in_pod_] = 1;
|
||
return $line_of_tokens;
|
||
}
|
||
else {
|
||
$line_of_tokens->{_line_type} = 'DATA';
|
||
return $line_of_tokens;
|
||
}
|
||
}
|
||
|
||
# print line unchanged if we are in __END__ section
|
||
elsif ( $tokenizer_self->[_in_end_] ) {
|
||
|
||
# ...but look for POD
|
||
# Note that the _in_data and _in_end flags remain set
|
||
# so that we return to that state after seeing the
|
||
# end of a pod section
|
||
if ( $input_line =~ /^=(\w+)\b/ && $1 ne 'cut' ) {
|
||
$line_of_tokens->{_line_type} = 'POD_START';
|
||
write_logfile_entry("Entering POD section\n");
|
||
$tokenizer_self->[_in_pod_] = 1;
|
||
return $line_of_tokens;
|
||
}
|
||
else {
|
||
$line_of_tokens->{_line_type} = 'END';
|
||
return $line_of_tokens;
|
||
}
|
||
}
|
||
|
||
# check for a hash-bang line if we haven't seen one
|
||
if ( !$tokenizer_self->[_saw_hash_bang_] ) {
|
||
if ( $input_line =~ /^\#\!.*perl\b/ ) {
|
||
$tokenizer_self->[_saw_hash_bang_] = $input_line_number;
|
||
|
||
# check for -w and -P flags
|
||
if ( $input_line =~ /^\#\!.*perl\s.*-.*P/ ) {
|
||
$tokenizer_self->[_saw_perl_dash_P_] = 1;
|
||
}
|
||
|
||
if ( $input_line =~ /^\#\!.*perl\s.*-.*w/ ) {
|
||
$tokenizer_self->[_saw_perl_dash_w_] = 1;
|
||
}
|
||
|
||
if (
|
||
$input_line_number > 1
|
||
|
||
# leave any hash bang in a BEGIN block alone
|
||
# i.e. see 'debugger-duck_type.t'
|
||
&& !(
|
||
$last_nonblank_block_type
|
||
&& $last_nonblank_block_type eq 'BEGIN'
|
||
)
|
||
&& !$tokenizer_self->[_look_for_hash_bang_]
|
||
|
||
# Try to avoid giving a false alarm at a simple comment.
|
||
# These look like valid hash-bang lines:
|
||
|
||
#!/usr/bin/perl -w
|
||
#! /usr/bin/perl -w
|
||
#!c:\perl\bin\perl.exe
|
||
|
||
# These are comments:
|
||
#! I love perl
|
||
#! sunos does not yet provide a /usr/bin/perl
|
||
|
||
# Comments typically have multiple spaces, which suggests
|
||
# the filter
|
||
&& $input_line =~ /^\#\!(\s+)?(\S+)?perl/
|
||
)
|
||
{
|
||
|
||
# this is helpful for VMS systems; we may have accidentally
|
||
# tokenized some DCL commands
|
||
if ( $tokenizer_self->[_started_tokenizing_] ) {
|
||
warning(
|
||
"There seems to be a hash-bang after line 1; do you need to run with -x ?\n"
|
||
);
|
||
}
|
||
else {
|
||
complain("Useless hash-bang after line 1\n");
|
||
}
|
||
}
|
||
|
||
# Report the leading hash-bang as a system line
|
||
# This will prevent -dac from deleting it
|
||
else {
|
||
$line_of_tokens->{_line_type} = 'SYSTEM';
|
||
return $line_of_tokens;
|
||
}
|
||
}
|
||
}
|
||
|
||
# wait for a hash-bang before parsing if the user invoked us with -x
|
||
if ( $tokenizer_self->[_look_for_hash_bang_]
|
||
&& !$tokenizer_self->[_saw_hash_bang_] )
|
||
{
|
||
$line_of_tokens->{_line_type} = 'SYSTEM';
|
||
return $line_of_tokens;
|
||
}
|
||
|
||
# a first line of the form ': #' will be marked as SYSTEM
|
||
# since lines of this form may be used by tcsh
|
||
if ( $input_line_number == 1 && $input_line =~ /^\s*\:\s*\#/ ) {
|
||
$line_of_tokens->{_line_type} = 'SYSTEM';
|
||
return $line_of_tokens;
|
||
}
|
||
|
||
# now we know that it is ok to tokenize the line...
|
||
# the line tokenizer will modify any of these private variables:
|
||
# _rhere_target_list
|
||
# _in_data
|
||
# _in_end
|
||
# _in_format
|
||
# _in_error
|
||
# _in_pod
|
||
# _in_quote
|
||
my $ending_in_quote_last = $tokenizer_self->[_in_quote_];
|
||
tokenize_this_line($line_of_tokens);
|
||
|
||
# Now finish defining the return structure and return it
|
||
$line_of_tokens->{_ending_in_quote} = $tokenizer_self->[_in_quote_];
|
||
|
||
# handle severe error (binary data in script)
|
||
if ( $tokenizer_self->[_in_error_] ) {
|
||
$tokenizer_self->[_in_quote_] = 0; # to avoid any more messages
|
||
warning("Giving up after error\n");
|
||
$line_of_tokens->{_line_type} = 'ERROR';
|
||
reset_indentation_level(0); # avoid error messages
|
||
return $line_of_tokens;
|
||
}
|
||
|
||
# handle start of pod documentation
|
||
if ( $tokenizer_self->[_in_pod_] ) {
|
||
|
||
# This gets tricky..above a __DATA__ or __END__ section, perl
|
||
# accepts '=cut' as the start of pod section. But afterwards,
|
||
# only pod utilities see it and they may ignore an =cut without
|
||
# leading =head. In any case, this isn't good.
|
||
if ( $input_line =~ /^=cut\b/ ) {
|
||
if ( $tokenizer_self->[_saw_data_] || $tokenizer_self->[_saw_end_] )
|
||
{
|
||
complain("=cut while not in pod ignored\n");
|
||
$tokenizer_self->[_in_pod_] = 0;
|
||
$line_of_tokens->{_line_type} = 'POD_END';
|
||
}
|
||
else {
|
||
$line_of_tokens->{_line_type} = 'POD_START';
|
||
warning(
|
||
"=cut starts a pod section .. this can fool pod utilities.\n"
|
||
);
|
||
write_logfile_entry("Entering POD section\n");
|
||
}
|
||
}
|
||
|
||
else {
|
||
$line_of_tokens->{_line_type} = 'POD_START';
|
||
write_logfile_entry("Entering POD section\n");
|
||
}
|
||
|
||
return $line_of_tokens;
|
||
}
|
||
|
||
# Update indentation levels for log messages.
|
||
# Skip blank lines and also block comments, unless a logfile is requested.
|
||
# Note that _line_of_text_ is the input line but trimmed from left to right.
|
||
my $lot = $tokenizer_self->[_line_of_text_];
|
||
if ( $lot && ( $self->[_rOpts_logfile_] || substr( $lot, 0, 1 ) ne '#' ) ) {
|
||
my $rlevels = $line_of_tokens->{_rlevels};
|
||
$line_of_tokens->{_guessed_indentation_level} =
|
||
guess_old_indentation_level($input_line);
|
||
}
|
||
|
||
# see if this line contains here doc targets
|
||
my $rhere_target_list = $tokenizer_self->[_rhere_target_list_];
|
||
if ( @{$rhere_target_list} ) {
|
||
|
||
my ( $here_doc_target, $here_quote_character ) =
|
||
@{ shift @{$rhere_target_list} };
|
||
$tokenizer_self->[_in_here_doc_] = 1;
|
||
$tokenizer_self->[_here_doc_target_] = $here_doc_target;
|
||
$tokenizer_self->[_here_quote_character_] = $here_quote_character;
|
||
write_logfile_entry("Entering HERE document $here_doc_target\n");
|
||
$tokenizer_self->[_started_looking_for_here_target_at_] =
|
||
$input_line_number;
|
||
}
|
||
|
||
# NOTE: __END__ and __DATA__ statements are written unformatted
|
||
# because they can theoretically contain additional characters
|
||
# which are not tokenized (and cannot be read with <DATA> either!).
|
||
if ( $tokenizer_self->[_in_data_] ) {
|
||
$line_of_tokens->{_line_type} = 'DATA_START';
|
||
write_logfile_entry("Starting __DATA__ section\n");
|
||
$tokenizer_self->[_saw_data_] = 1;
|
||
|
||
# keep parsing after __DATA__ if use SelfLoader was seen
|
||
if ( $tokenizer_self->[_saw_selfloader_] ) {
|
||
$tokenizer_self->[_in_data_] = 0;
|
||
write_logfile_entry(
|
||
"SelfLoader seen, continuing; -nlsl deactivates\n");
|
||
}
|
||
|
||
return $line_of_tokens;
|
||
}
|
||
|
||
elsif ( $tokenizer_self->[_in_end_] ) {
|
||
$line_of_tokens->{_line_type} = 'END_START';
|
||
write_logfile_entry("Starting __END__ section\n");
|
||
$tokenizer_self->[_saw_end_] = 1;
|
||
|
||
# keep parsing after __END__ if use AutoLoader was seen
|
||
if ( $tokenizer_self->[_saw_autoloader_] ) {
|
||
$tokenizer_self->[_in_end_] = 0;
|
||
write_logfile_entry(
|
||
"AutoLoader seen, continuing; -nlal deactivates\n");
|
||
}
|
||
return $line_of_tokens;
|
||
}
|
||
|
||
# now, finally, we know that this line is type 'CODE'
|
||
$line_of_tokens->{_line_type} = 'CODE';
|
||
|
||
# remember if we have seen any real code
|
||
if ( !$tokenizer_self->[_started_tokenizing_]
|
||
&& $input_line !~ /^\s*$/
|
||
&& $input_line !~ /^\s*#/ )
|
||
{
|
||
$tokenizer_self->[_started_tokenizing_] = 1;
|
||
}
|
||
|
||
if ( $tokenizer_self->[_debugger_object_] ) {
|
||
$tokenizer_self->[_debugger_object_]
|
||
->write_debug_entry($line_of_tokens);
|
||
}
|
||
|
||
# Note: if keyword 'format' occurs in this line code, it is still CODE
|
||
# (keyword 'format' need not start a line)
|
||
if ( $tokenizer_self->[_in_format_] ) {
|
||
write_logfile_entry("Entering format section\n");
|
||
}
|
||
|
||
if ( $tokenizer_self->[_in_quote_]
|
||
and ( $tokenizer_self->[_line_start_quote_] < 0 ) )
|
||
{
|
||
|
||
#if ( ( my $quote_target = get_quote_target() ) !~ /^\s*$/ ) {
|
||
if ( ( my $quote_target = $tokenizer_self->[_quote_target_] ) !~
|
||
/^\s*$/ )
|
||
{
|
||
$tokenizer_self->[_line_start_quote_] = $input_line_number;
|
||
write_logfile_entry(
|
||
"Start multi-line quote or pattern ending in $quote_target\n");
|
||
}
|
||
}
|
||
elsif ( ( $tokenizer_self->[_line_start_quote_] >= 0 )
|
||
&& !$tokenizer_self->[_in_quote_] )
|
||
{
|
||
$tokenizer_self->[_line_start_quote_] = -1;
|
||
write_logfile_entry("End of multi-line quote or pattern\n");
|
||
}
|
||
|
||
# we are returning a line of CODE
|
||
return $line_of_tokens;
|
||
}
|
||
|
||
sub find_starting_indentation_level {
|
||
|
||
# We need to find the indentation level of the first line of the
|
||
# script being formatted. Often it will be zero for an entire file,
|
||
# but if we are formatting a local block of code (within an editor for
|
||
# example) it may not be zero. The user may specify this with the
|
||
# -sil=n parameter but normally doesn't so we have to guess.
|
||
#
|
||
# USES GLOBAL VARIABLES: $tokenizer_self
|
||
my $starting_level = 0;
|
||
|
||
# use value if given as parameter
|
||
if ( $tokenizer_self->[_know_starting_level_] ) {
|
||
$starting_level = $tokenizer_self->[_starting_level_];
|
||
}
|
||
|
||
# if we know there is a hash_bang line, the level must be zero
|
||
elsif ( $tokenizer_self->[_look_for_hash_bang_] ) {
|
||
$tokenizer_self->[_know_starting_level_] = 1;
|
||
}
|
||
|
||
# otherwise figure it out from the input file
|
||
else {
|
||
my $line;
|
||
my $i = 0;
|
||
|
||
# keep looking at lines until we find a hash bang or piece of code
|
||
my $msg = "";
|
||
while ( $line =
|
||
$tokenizer_self->[_line_buffer_object_]->peek_ahead( $i++ ) )
|
||
{
|
||
|
||
# if first line is #! then assume starting level is zero
|
||
if ( $i == 1 && $line =~ /^\#\!/ ) {
|
||
$starting_level = 0;
|
||
last;
|
||
}
|
||
next if ( $line =~ /^\s*#/ ); # skip past comments
|
||
next if ( $line =~ /^\s*$/ ); # skip past blank lines
|
||
$starting_level = guess_old_indentation_level($line);
|
||
last;
|
||
}
|
||
$msg = "Line $i implies starting-indentation-level = $starting_level\n";
|
||
write_logfile_entry("$msg");
|
||
}
|
||
$tokenizer_self->[_starting_level_] = $starting_level;
|
||
reset_indentation_level($starting_level);
|
||
return;
|
||
}
|
||
|
||
sub guess_old_indentation_level {
|
||
my ($line) = @_;
|
||
|
||
# Guess the indentation level of an input line.
|
||
#
|
||
# For the first line of code this result will define the starting
|
||
# indentation level. It will mainly be non-zero when perltidy is applied
|
||
# within an editor to a local block of code.
|
||
#
|
||
# This is an impossible task in general because we can't know what tabs
|
||
# meant for the old script and how many spaces were used for one
|
||
# indentation level in the given input script. For example it may have
|
||
# been previously formatted with -i=7 -et=3. But we can at least try to
|
||
# make sure that perltidy guesses correctly if it is applied repeatedly to
|
||
# a block of code within an editor, so that the block stays at the same
|
||
# level when perltidy is applied repeatedly.
|
||
#
|
||
# USES GLOBAL VARIABLES: $tokenizer_self
|
||
my $level = 0;
|
||
|
||
# find leading tabs, spaces, and any statement label
|
||
my $spaces = 0;
|
||
if ( $line =~ /^(\t+)?(\s+)?(\w+:[^:])?/ ) {
|
||
|
||
# If there are leading tabs, we use the tab scheme for this run, if
|
||
# any, so that the code will remain stable when editing.
|
||
if ($1) { $spaces += length($1) * $tokenizer_self->[_tabsize_] }
|
||
|
||
if ($2) { $spaces += length($2) }
|
||
|
||
# correct for outdented labels
|
||
if ( $3 && $tokenizer_self->[_outdent_labels_] ) {
|
||
$spaces += $tokenizer_self->[_continuation_indentation_];
|
||
}
|
||
}
|
||
|
||
# compute indentation using the value of -i for this run.
|
||
# If -i=0 is used for this run (which is possible) it doesn't matter
|
||
# what we do here but we'll guess that the old run used 4 spaces per level.
|
||
my $indent_columns = $tokenizer_self->[_indent_columns_];
|
||
$indent_columns = 4 if ( !$indent_columns );
|
||
$level = int( $spaces / $indent_columns );
|
||
return ($level);
|
||
}
|
||
|
||
# This is a currently unused debug routine
|
||
sub dump_functions {
|
||
|
||
my $fh = *STDOUT;
|
||
foreach my $pkg ( keys %is_user_function ) {
|
||
$fh->print("\nnon-constant subs in package $pkg\n");
|
||
|
||
foreach my $sub ( keys %{ $is_user_function{$pkg} } ) {
|
||
my $msg = "";
|
||
if ( $is_block_list_function{$pkg}{$sub} ) {
|
||
$msg = 'block_list';
|
||
}
|
||
|
||
if ( $is_block_function{$pkg}{$sub} ) {
|
||
$msg = 'block';
|
||
}
|
||
$fh->print("$sub $msg\n");
|
||
}
|
||
}
|
||
|
||
foreach my $pkg ( keys %is_constant ) {
|
||
$fh->print("\nconstants and constant subs in package $pkg\n");
|
||
|
||
foreach my $sub ( keys %{ $is_constant{$pkg} } ) {
|
||
$fh->print("$sub\n");
|
||
}
|
||
}
|
||
return;
|
||
}
|
||
|
||
sub prepare_for_a_new_file {
|
||
|
||
# previous tokens needed to determine what to expect next
|
||
$last_nonblank_token = ';'; # the only possible starting state which
|
||
$last_nonblank_type = ';'; # will make a leading brace a code block
|
||
$last_nonblank_block_type = '';
|
||
|
||
# scalars for remembering statement types across multiple lines
|
||
$statement_type = ''; # '' or 'use' or 'sub..' or 'case..'
|
||
$in_attribute_list = 0;
|
||
|
||
# scalars for remembering where we are in the file
|
||
$current_package = "main";
|
||
$context = UNKNOWN_CONTEXT;
|
||
|
||
# hashes used to remember function information
|
||
%is_constant = (); # user-defined constants
|
||
%is_user_function = (); # user-defined functions
|
||
%user_function_prototype = (); # their prototypes
|
||
%is_block_function = ();
|
||
%is_block_list_function = ();
|
||
%saw_function_definition = ();
|
||
%saw_use_module = ();
|
||
|
||
# variables used to track depths of various containers
|
||
# and report nesting errors
|
||
$paren_depth = 0;
|
||
$brace_depth = 0;
|
||
$square_bracket_depth = 0;
|
||
@current_depth = (0) x scalar @closing_brace_names;
|
||
$total_depth = 0;
|
||
@total_depth = ();
|
||
@nesting_sequence_number = ( 0 .. @closing_brace_names - 1 );
|
||
@current_sequence_number = ();
|
||
|
||
@paren_type = ();
|
||
@paren_semicolon_count = ();
|
||
@paren_structural_type = ();
|
||
@brace_type = ();
|
||
@brace_structural_type = ();
|
||
@brace_context = ();
|
||
@brace_package = ();
|
||
@square_bracket_type = ();
|
||
@square_bracket_structural_type = ();
|
||
@depth_array = ();
|
||
@nested_ternary_flag = ();
|
||
@nested_statement_type = ();
|
||
@starting_line_of_current_depth = ();
|
||
|
||
$paren_type[$paren_depth] = '';
|
||
$paren_semicolon_count[$paren_depth] = 0;
|
||
$paren_structural_type[$brace_depth] = '';
|
||
$brace_type[$brace_depth] = ';'; # identify opening brace as code block
|
||
$brace_structural_type[$brace_depth] = '';
|
||
$brace_context[$brace_depth] = UNKNOWN_CONTEXT;
|
||
$brace_package[$paren_depth] = $current_package;
|
||
$square_bracket_type[$square_bracket_depth] = '';
|
||
$square_bracket_structural_type[$square_bracket_depth] = '';
|
||
|
||
initialize_tokenizer_state();
|
||
return;
|
||
}
|
||
|
||
{ ## closure for sub tokenize_this_line
|
||
|
||
use constant BRACE => 0;
|
||
use constant SQUARE_BRACKET => 1;
|
||
use constant PAREN => 2;
|
||
use constant QUESTION_COLON => 3;
|
||
|
||
# TV1: scalars for processing one LINE.
|
||
# Re-initialized on each entry to sub tokenize_this_line.
|
||
my (
|
||
$block_type, $container_type, $expecting,
|
||
$i, $i_tok, $input_line,
|
||
$input_line_number, $last_nonblank_i, $max_token_index,
|
||
$next_tok, $next_type, $peeked_ahead,
|
||
$prototype, $rhere_target_list, $rtoken_map,
|
||
$rtoken_type, $rtokens, $tok,
|
||
$type, $type_sequence, $indent_flag,
|
||
);
|
||
|
||
# TV2: refs to ARRAYS for processing one LINE
|
||
# Re-initialized on each call.
|
||
my $routput_token_list = []; # stack of output token indexes
|
||
my $routput_token_type = []; # token types
|
||
my $routput_block_type = []; # types of code block
|
||
my $routput_container_type = []; # paren types, such as if, elsif, ..
|
||
my $routput_type_sequence = []; # nesting sequential number
|
||
my $routput_indent_flag = []; #
|
||
|
||
# TV3: SCALARS for quote variables. These are initialized with a
|
||
# subroutine call and continually updated as lines are processed.
|
||
my ( $in_quote, $quote_type, $quote_character, $quote_pos, $quote_depth,
|
||
$quoted_string_1, $quoted_string_2, $allowed_quote_modifiers, );
|
||
|
||
# TV4: SCALARS for multi-line identifiers and
|
||
# statements. These are initialized with a subroutine call
|
||
# and continually updated as lines are processed.
|
||
my ( $id_scan_state, $identifier, $want_paren, $indented_if_level );
|
||
|
||
# TV5: SCALARS for tracking indentation level.
|
||
# Initialized once and continually updated as lines are
|
||
# processed.
|
||
my (
|
||
$nesting_token_string, $nesting_type_string,
|
||
$nesting_block_string, $nesting_block_flag,
|
||
$nesting_list_string, $nesting_list_flag,
|
||
$ci_string_in_tokenizer, $continuation_string_in_tokenizer,
|
||
$in_statement_continuation, $level_in_tokenizer,
|
||
$slevel_in_tokenizer, $rslevel_stack,
|
||
);
|
||
|
||
# TV6: SCALARS for remembering several previous
|
||
# tokens. Initialized once and continually updated as
|
||
# lines are processed.
|
||
my (
|
||
$last_nonblank_container_type, $last_nonblank_type_sequence,
|
||
$last_last_nonblank_token, $last_last_nonblank_type,
|
||
$last_last_nonblank_block_type, $last_last_nonblank_container_type,
|
||
$last_last_nonblank_type_sequence, $last_nonblank_prototype,
|
||
);
|
||
|
||
# ----------------------------------------------------------------
|
||
# beginning of tokenizer variable access and manipulation routines
|
||
# ----------------------------------------------------------------
|
||
|
||
sub initialize_tokenizer_state {
|
||
|
||
# TV1: initialized on each call
|
||
# TV2: initialized on each call
|
||
# TV3:
|
||
$in_quote = 0;
|
||
$quote_type = 'Q';
|
||
$quote_character = "";
|
||
$quote_pos = 0;
|
||
$quote_depth = 0;
|
||
$quoted_string_1 = "";
|
||
$quoted_string_2 = "";
|
||
$allowed_quote_modifiers = "";
|
||
|
||
# TV4:
|
||
$id_scan_state = '';
|
||
$identifier = '';
|
||
$want_paren = "";
|
||
$indented_if_level = 0;
|
||
|
||
# TV5:
|
||
$nesting_token_string = "";
|
||
$nesting_type_string = "";
|
||
$nesting_block_string = '1'; # initially in a block
|
||
$nesting_block_flag = 1;
|
||
$nesting_list_string = '0'; # initially not in a list
|
||
$nesting_list_flag = 0; # initially not in a list
|
||
$ci_string_in_tokenizer = "";
|
||
$continuation_string_in_tokenizer = "0";
|
||
$in_statement_continuation = 0;
|
||
$level_in_tokenizer = 0;
|
||
$slevel_in_tokenizer = 0;
|
||
$rslevel_stack = [];
|
||
|
||
# TV6:
|
||
$last_nonblank_container_type = '';
|
||
$last_nonblank_type_sequence = '';
|
||
$last_last_nonblank_token = ';';
|
||
$last_last_nonblank_type = ';';
|
||
$last_last_nonblank_block_type = '';
|
||
$last_last_nonblank_container_type = '';
|
||
$last_last_nonblank_type_sequence = '';
|
||
$last_nonblank_prototype = "";
|
||
return;
|
||
}
|
||
|
||
sub save_tokenizer_state {
|
||
|
||
my $rTV1 = [
|
||
$block_type, $container_type, $expecting,
|
||
$i, $i_tok, $input_line,
|
||
$input_line_number, $last_nonblank_i, $max_token_index,
|
||
$next_tok, $next_type, $peeked_ahead,
|
||
$prototype, $rhere_target_list, $rtoken_map,
|
||
$rtoken_type, $rtokens, $tok,
|
||
$type, $type_sequence, $indent_flag,
|
||
];
|
||
|
||
my $rTV2 = [
|
||
$routput_token_list, $routput_token_type,
|
||
$routput_block_type, $routput_container_type,
|
||
$routput_type_sequence, $routput_indent_flag,
|
||
];
|
||
|
||
my $rTV3 = [
|
||
$in_quote, $quote_type,
|
||
$quote_character, $quote_pos,
|
||
$quote_depth, $quoted_string_1,
|
||
$quoted_string_2, $allowed_quote_modifiers,
|
||
];
|
||
|
||
my $rTV4 =
|
||
[ $id_scan_state, $identifier, $want_paren, $indented_if_level ];
|
||
|
||
my $rTV5 = [
|
||
$nesting_token_string, $nesting_type_string,
|
||
$nesting_block_string, $nesting_block_flag,
|
||
$nesting_list_string, $nesting_list_flag,
|
||
$ci_string_in_tokenizer, $continuation_string_in_tokenizer,
|
||
$in_statement_continuation, $level_in_tokenizer,
|
||
$slevel_in_tokenizer, $rslevel_stack,
|
||
];
|
||
|
||
my $rTV6 = [
|
||
$last_nonblank_container_type,
|
||
$last_nonblank_type_sequence,
|
||
$last_last_nonblank_token,
|
||
$last_last_nonblank_type,
|
||
$last_last_nonblank_block_type,
|
||
$last_last_nonblank_container_type,
|
||
$last_last_nonblank_type_sequence,
|
||
$last_nonblank_prototype,
|
||
];
|
||
return [ $rTV1, $rTV2, $rTV3, $rTV4, $rTV5, $rTV6 ];
|
||
}
|
||
|
||
sub restore_tokenizer_state {
|
||
my ($rstate) = @_;
|
||
my ( $rTV1, $rTV2, $rTV3, $rTV4, $rTV5, $rTV6 ) = @{$rstate};
|
||
(
|
||
$block_type, $container_type, $expecting,
|
||
$i, $i_tok, $input_line,
|
||
$input_line_number, $last_nonblank_i, $max_token_index,
|
||
$next_tok, $next_type, $peeked_ahead,
|
||
$prototype, $rhere_target_list, $rtoken_map,
|
||
$rtoken_type, $rtokens, $tok,
|
||
$type, $type_sequence, $indent_flag,
|
||
) = @{$rTV1};
|
||
|
||
(
|
||
$routput_token_list, $routput_token_type,
|
||
$routput_block_type, $routput_container_type,
|
||
$routput_type_sequence, $routput_type_sequence,
|
||
) = @{$rTV2};
|
||
|
||
(
|
||
$in_quote, $quote_type, $quote_character, $quote_pos, $quote_depth,
|
||
$quoted_string_1, $quoted_string_2, $allowed_quote_modifiers,
|
||
) = @{$rTV3};
|
||
|
||
( $id_scan_state, $identifier, $want_paren, $indented_if_level ) =
|
||
@{$rTV4};
|
||
|
||
(
|
||
$nesting_token_string, $nesting_type_string,
|
||
$nesting_block_string, $nesting_block_flag,
|
||
$nesting_list_string, $nesting_list_flag,
|
||
$ci_string_in_tokenizer, $continuation_string_in_tokenizer,
|
||
$in_statement_continuation, $level_in_tokenizer,
|
||
$slevel_in_tokenizer, $rslevel_stack,
|
||
) = @{$rTV5};
|
||
|
||
(
|
||
$last_nonblank_container_type,
|
||
$last_nonblank_type_sequence,
|
||
$last_last_nonblank_token,
|
||
$last_last_nonblank_type,
|
||
$last_last_nonblank_block_type,
|
||
$last_last_nonblank_container_type,
|
||
$last_last_nonblank_type_sequence,
|
||
$last_nonblank_prototype,
|
||
) = @{$rTV6};
|
||
return;
|
||
}
|
||
|
||
sub get_indentation_level {
|
||
|
||
# patch to avoid reporting error if indented if is not terminated
|
||
if ($indented_if_level) { return $level_in_tokenizer - 1 }
|
||
return $level_in_tokenizer;
|
||
}
|
||
|
||
sub reset_indentation_level {
|
||
$level_in_tokenizer = $slevel_in_tokenizer = shift;
|
||
push @{$rslevel_stack}, $slevel_in_tokenizer;
|
||
return;
|
||
}
|
||
|
||
sub peeked_ahead {
|
||
my $flag = shift;
|
||
$peeked_ahead = defined($flag) ? $flag : $peeked_ahead;
|
||
return $peeked_ahead;
|
||
}
|
||
|
||
# ------------------------------------------------------------
|
||
# end of tokenizer variable access and manipulation routines
|
||
# ------------------------------------------------------------
|
||
|
||
# ------------------------------------------------------------
|
||
# beginning of various scanner interface routines
|
||
# ------------------------------------------------------------
|
||
sub scan_replacement_text {
|
||
|
||
# check for here-docs in replacement text invoked by
|
||
# a substitution operator with executable modifier 'e'.
|
||
#
|
||
# given:
|
||
# $replacement_text
|
||
# return:
|
||
# $rht = reference to any here-doc targets
|
||
my ($replacement_text) = @_;
|
||
|
||
# quick check
|
||
return unless ( $replacement_text =~ /<</ );
|
||
|
||
write_logfile_entry("scanning replacement text for here-doc targets\n");
|
||
|
||
# save the logger object for error messages
|
||
my $logger_object = $tokenizer_self->[_logger_object_];
|
||
|
||
# localize all package variables
|
||
local (
|
||
$tokenizer_self, $last_nonblank_token,
|
||
$last_nonblank_type, $last_nonblank_block_type,
|
||
$statement_type, $in_attribute_list,
|
||
$current_package, $context,
|
||
%is_constant, %is_user_function,
|
||
%user_function_prototype, %is_block_function,
|
||
%is_block_list_function, %saw_function_definition,
|
||
$brace_depth, $paren_depth,
|
||
$square_bracket_depth, @current_depth,
|
||
@total_depth, $total_depth,
|
||
@nesting_sequence_number, @current_sequence_number,
|
||
@paren_type, @paren_semicolon_count,
|
||
@paren_structural_type, @brace_type,
|
||
@brace_structural_type, @brace_context,
|
||
@brace_package, @square_bracket_type,
|
||
@square_bracket_structural_type, @depth_array,
|
||
@starting_line_of_current_depth, @nested_ternary_flag,
|
||
@nested_statement_type,
|
||
);
|
||
|
||
# save all lexical variables
|
||
my $rstate = save_tokenizer_state();
|
||
_decrement_count(); # avoid error check for multiple tokenizers
|
||
|
||
# make a new tokenizer
|
||
my $rOpts = {};
|
||
my $rpending_logfile_message;
|
||
my $source_object = Perl::Tidy::LineSource->new(
|
||
input_file => \$replacement_text,
|
||
rOpts => $rOpts,
|
||
rpending_logfile_message => $rpending_logfile_message,
|
||
);
|
||
my $tokenizer = Perl::Tidy::Tokenizer->new(
|
||
source_object => $source_object,
|
||
logger_object => $logger_object,
|
||
starting_line_number => $input_line_number,
|
||
);
|
||
|
||
# scan the replacement text
|
||
1 while ( $tokenizer->get_line() );
|
||
|
||
# remove any here doc targets
|
||
my $rht = undef;
|
||
if ( $tokenizer_self->[_in_here_doc_] ) {
|
||
$rht = [];
|
||
push @{$rht},
|
||
[
|
||
$tokenizer_self->[_here_doc_target_],
|
||
$tokenizer_self->[_here_quote_character_]
|
||
];
|
||
if ( $tokenizer_self->[_rhere_target_list_] ) {
|
||
push @{$rht}, @{ $tokenizer_self->[_rhere_target_list_] };
|
||
$tokenizer_self->[_rhere_target_list_] = undef;
|
||
}
|
||
$tokenizer_self->[_in_here_doc_] = undef;
|
||
}
|
||
|
||
# now its safe to report errors
|
||
my $severe_error = $tokenizer->report_tokenization_errors();
|
||
|
||
# TODO: Could propagate a severe error up
|
||
|
||
# restore all tokenizer lexical variables
|
||
restore_tokenizer_state($rstate);
|
||
|
||
# return the here doc targets
|
||
return $rht;
|
||
}
|
||
|
||
sub scan_bare_identifier {
|
||
( $i, $tok, $type, $prototype ) =
|
||
scan_bare_identifier_do( $input_line, $i, $tok, $type, $prototype,
|
||
$rtoken_map, $max_token_index );
|
||
return;
|
||
}
|
||
|
||
sub scan_identifier {
|
||
( $i, $tok, $type, $id_scan_state, $identifier ) =
|
||
scan_identifier_do( $i, $id_scan_state, $identifier, $rtokens,
|
||
$max_token_index, $expecting, $paren_type[$paren_depth] );
|
||
return;
|
||
}
|
||
|
||
use constant VERIFY_FASTSCAN => 0;
|
||
my %fast_scan_context;
|
||
|
||
BEGIN {
|
||
%fast_scan_context = (
|
||
'$' => SCALAR_CONTEXT,
|
||
'*' => SCALAR_CONTEXT,
|
||
'@' => LIST_CONTEXT,
|
||
'%' => LIST_CONTEXT,
|
||
'&' => UNKNOWN_CONTEXT,
|
||
);
|
||
}
|
||
|
||
sub scan_identifier_fast {
|
||
|
||
# This is a wrapper for sub scan_identifier. It does a fast preliminary
|
||
# scan for certain common identifiers:
|
||
# '$var', '@var', %var, *var, &var, '@{...}', '%{...}'
|
||
# If it does not find one of these, or this is a restart, it calls the
|
||
# original scanner directly.
|
||
|
||
# This gives the same results as the full scanner in about 1/4 the
|
||
# total runtime for a typical input stream.
|
||
|
||
my $i_begin = $i;
|
||
my $tok_begin = $tok;
|
||
my $fast_scan_type;
|
||
|
||
###############################
|
||
# quick scan with leading sigil
|
||
###############################
|
||
if ( !$id_scan_state
|
||
&& $i + 1 <= $max_token_index
|
||
&& $fast_scan_context{$tok} )
|
||
{
|
||
$context = $fast_scan_context{$tok};
|
||
|
||
# look for $var, @var, ...
|
||
if ( $rtoken_type->[ $i + 1 ] eq 'w' ) {
|
||
my $pretype_next = "";
|
||
my $i_next = $i + 2;
|
||
if ( $i_next <= $max_token_index ) {
|
||
if ( $rtoken_type->[$i_next] eq 'b'
|
||
&& $i_next < $max_token_index )
|
||
{
|
||
$i_next += 1;
|
||
}
|
||
$pretype_next = $rtoken_type->[$i_next];
|
||
}
|
||
if ( $pretype_next ne ':' && $pretype_next ne "'" ) {
|
||
|
||
# Found type 'i' like '$var', '@var', or '%var'
|
||
$identifier = $tok . $rtokens->[ $i + 1 ];
|
||
$tok = $identifier;
|
||
$type = 'i';
|
||
$i = $i + 1;
|
||
$fast_scan_type = $type;
|
||
}
|
||
}
|
||
|
||
# Look for @{ or %{ .
|
||
# But we must let the full scanner handle things ${ because it may
|
||
# keep going to get a complete identifier like '${#}' .
|
||
elsif (
|
||
$rtoken_type->[ $i + 1 ] eq '{'
|
||
&& ( $tok_begin eq '@'
|
||
|| $tok_begin eq '%' )
|
||
)
|
||
{
|
||
|
||
$identifier = $tok;
|
||
$type = 't';
|
||
$fast_scan_type = $type;
|
||
}
|
||
}
|
||
|
||
############################
|
||
# Quick scan with leading ->
|
||
# Look for ->[ and ->{
|
||
############################
|
||
elsif (
|
||
$tok eq '->'
|
||
&& $i < $max_token_index
|
||
&& ( $rtokens->[ $i + 1 ] eq '{'
|
||
|| $rtokens->[ $i + 1 ] eq '[' )
|
||
)
|
||
{
|
||
$type = $tok;
|
||
$fast_scan_type = $type;
|
||
$identifier = $tok;
|
||
$context = UNKNOWN_CONTEXT;
|
||
}
|
||
|
||
#######################################
|
||
# Verify correctness during development
|
||
#######################################
|
||
if ( VERIFY_FASTSCAN && $fast_scan_type ) {
|
||
|
||
# We will call the full method
|
||
my $identifier_simple = $identifier;
|
||
my $tok_simple = $tok;
|
||
my $fast_scan_type = $type;
|
||
my $i_simple = $i;
|
||
my $context_simple = $context;
|
||
|
||
$tok = $tok_begin;
|
||
$i = $i_begin;
|
||
scan_identifier();
|
||
|
||
if ( $tok ne $tok_simple
|
||
|| $type ne $fast_scan_type
|
||
|| $i != $i_simple
|
||
|| $identifier ne $identifier_simple
|
||
|| $id_scan_state
|
||
|| $context ne $context_simple )
|
||
{
|
||
print STDERR <<EOM;
|
||
scan_identifier_fast differs from scan_identifier:
|
||
simple: i=$i_simple, tok=$tok_simple, type=$fast_scan_type, ident=$identifier_simple, context='$context_simple
|
||
full: i=$i, tok=$tok, type=$type, ident=$identifier, context='$context state=$id_scan_state
|
||
EOM
|
||
}
|
||
}
|
||
|
||
###################################################
|
||
# call full scanner if fast method did not succeed
|
||
###################################################
|
||
if ( !$fast_scan_type ) {
|
||
scan_identifier();
|
||
}
|
||
return;
|
||
}
|
||
|
||
sub scan_id {
|
||
( $i, $tok, $type, $id_scan_state ) =
|
||
scan_id_do( $input_line, $i, $tok, $rtokens, $rtoken_map,
|
||
$id_scan_state, $max_token_index );
|
||
return;
|
||
}
|
||
|
||
sub scan_number {
|
||
my $number;
|
||
( $i, $type, $number ) =
|
||
scan_number_do( $input_line, $i, $rtoken_map, $type,
|
||
$max_token_index );
|
||
return $number;
|
||
}
|
||
|
||
use constant VERIFY_FASTNUM => 0;
|
||
|
||
sub scan_number_fast {
|
||
|
||
# This is a wrapper for sub scan_number. It does a fast preliminary
|
||
# scan for a simple integer. It calls the original scan_number if it
|
||
# does not find one.
|
||
|
||
my $i_begin = $i;
|
||
my $tok_begin = $tok;
|
||
my $number;
|
||
|
||
##################################
|
||
# Quick check for (signed) integer
|
||
##################################
|
||
|
||
# This will be the string of digits:
|
||
my $i_d = $i;
|
||
my $tok_d = $tok;
|
||
my $typ_d = $rtoken_type->[$i_d];
|
||
|
||
# check for signed integer
|
||
my $sign = "";
|
||
if ( $typ_d ne 'd'
|
||
&& ( $typ_d eq '+' || $typ_d eq '-' )
|
||
&& $i_d < $max_token_index )
|
||
{
|
||
$sign = $tok_d;
|
||
$i_d++;
|
||
$tok_d = $rtokens->[$i_d];
|
||
$typ_d = $rtoken_type->[$i_d];
|
||
}
|
||
|
||
# Handle integers
|
||
if (
|
||
$typ_d eq 'd'
|
||
&& (
|
||
$i_d == $max_token_index
|
||
|| ( $i_d < $max_token_index
|
||
&& $rtoken_type->[ $i_d + 1 ] ne '.'
|
||
&& $rtoken_type->[ $i_d + 1 ] ne 'w' )
|
||
)
|
||
)
|
||
{
|
||
# Let let full scanner handle multi-digit integers beginning with
|
||
# '0' because there could be error messages. For example, '009' is
|
||
# not a valid number.
|
||
|
||
if ( $tok_d eq '0' || substr( $tok_d, 0, 1 ) ne '0' ) {
|
||
$number = $sign . $tok_d;
|
||
$type = 'n';
|
||
$i = $i_d;
|
||
}
|
||
}
|
||
|
||
#######################################
|
||
# Verify correctness during development
|
||
#######################################
|
||
if ( VERIFY_FASTNUM && defined($number) ) {
|
||
|
||
# We will call the full method
|
||
my $type_simple = $type;
|
||
my $i_simple = $i;
|
||
my $number_simple = $number;
|
||
|
||
$tok = $tok_begin;
|
||
$i = $i_begin;
|
||
$number = scan_number();
|
||
|
||
if ( $type ne $type_simple
|
||
|| ( $i != $i_simple && $i <= $max_token_index )
|
||
|| $number ne $number_simple )
|
||
{
|
||
print STDERR <<EOM;
|
||
scan_number_fast differs from scan_number:
|
||
simple: i=$i_simple, type=$type_simple, number=$number_simple
|
||
full: i=$i, type=$type, number=$number
|
||
EOM
|
||
}
|
||
}
|
||
|
||
#########################################
|
||
# call full scanner if may not be integer
|
||
#########################################
|
||
if ( !defined($number) ) {
|
||
$number = scan_number();
|
||
}
|
||
return $number;
|
||
}
|
||
|
||
# a sub to warn if token found where term expected
|
||
sub error_if_expecting_TERM {
|
||
if ( $expecting == TERM ) {
|
||
if ( $really_want_term{$last_nonblank_type} ) {
|
||
report_unexpected( $tok, "term", $i_tok, $last_nonblank_i,
|
||
$rtoken_map, $rtoken_type, $input_line );
|
||
return 1;
|
||
}
|
||
}
|
||
return;
|
||
}
|
||
|
||
# a sub to warn if token found where operator expected
|
||
sub error_if_expecting_OPERATOR {
|
||
my $thing = shift;
|
||
if ( $expecting == OPERATOR ) {
|
||
if ( !defined($thing) ) { $thing = $tok }
|
||
report_unexpected( $thing, "operator", $i_tok, $last_nonblank_i,
|
||
$rtoken_map, $rtoken_type, $input_line );
|
||
if ( $i_tok == 0 ) {
|
||
interrupt_logfile();
|
||
warning("Missing ';' or ',' above?\n");
|
||
resume_logfile();
|
||
}
|
||
return 1;
|
||
}
|
||
return;
|
||
}
|
||
|
||
# ------------------------------------------------------------
|
||
# end scanner interfaces
|
||
# ------------------------------------------------------------
|
||
|
||
my %is_for_foreach;
|
||
@_ = qw(for foreach);
|
||
@is_for_foreach{@_} = (1) x scalar(@_);
|
||
|
||
my %is_my_our_state;
|
||
@_ = qw(my our state);
|
||
@is_my_our_state{@_} = (1) x scalar(@_);
|
||
|
||
# These keywords may introduce blocks after parenthesized expressions,
|
||
# in the form:
|
||
# keyword ( .... ) { BLOCK }
|
||
# patch for SWITCH/CASE: added 'switch' 'case' 'given' 'when'
|
||
my %is_blocktype_with_paren;
|
||
@_ =
|
||
qw(if elsif unless while until for foreach switch case given when catch);
|
||
@is_blocktype_with_paren{@_} = (1) x scalar(@_);
|
||
|
||
my %is_case_default;
|
||
@_ = qw(case default);
|
||
@is_case_default{@_} = (1) x scalar(@_);
|
||
|
||
# ------------------------------------------------------------
|
||
# begin hash of code for handling most token types
|
||
# ------------------------------------------------------------
|
||
my $tokenization_code = {
|
||
|
||
# no special code for these types yet, but syntax checks
|
||
# could be added
|
||
|
||
## '!' => undef,
|
||
## '!=' => undef,
|
||
## '!~' => undef,
|
||
## '%=' => undef,
|
||
## '&&=' => undef,
|
||
## '&=' => undef,
|
||
## '+=' => undef,
|
||
## '-=' => undef,
|
||
## '..' => undef,
|
||
## '..' => undef,
|
||
## '...' => undef,
|
||
## '.=' => undef,
|
||
## '<<=' => undef,
|
||
## '<=' => undef,
|
||
## '<=>' => undef,
|
||
## '<>' => undef,
|
||
## '=' => undef,
|
||
## '==' => undef,
|
||
## '=~' => undef,
|
||
## '>=' => undef,
|
||
## '>>' => undef,
|
||
## '>>=' => undef,
|
||
## '\\' => undef,
|
||
## '^=' => undef,
|
||
## '|=' => undef,
|
||
## '||=' => undef,
|
||
## '//=' => undef,
|
||
## '~' => undef,
|
||
## '~~' => undef,
|
||
## '!~~' => undef,
|
||
|
||
'>' => sub {
|
||
error_if_expecting_TERM()
|
||
if ( $expecting == TERM );
|
||
},
|
||
'|' => sub {
|
||
error_if_expecting_TERM()
|
||
if ( $expecting == TERM );
|
||
},
|
||
'$' => sub {
|
||
|
||
# start looking for a scalar
|
||
error_if_expecting_OPERATOR("Scalar")
|
||
if ( $expecting == OPERATOR );
|
||
scan_identifier_fast();
|
||
|
||
if ( $identifier eq '$^W' ) {
|
||
$tokenizer_self->[_saw_perl_dash_w_] = 1;
|
||
}
|
||
|
||
# Check for identifier in indirect object slot
|
||
# (vorboard.pl, sort.t). Something like:
|
||
# /^(print|printf|sort|exec|system)$/
|
||
if (
|
||
$is_indirect_object_taker{$last_nonblank_token}
|
||
|| ( ( $last_nonblank_token eq '(' )
|
||
&& $is_indirect_object_taker{ $paren_type[$paren_depth] } )
|
||
|| ( $last_nonblank_type eq 'w'
|
||
|| $last_nonblank_type eq 'U' ) # possible object
|
||
)
|
||
{
|
||
$type = 'Z';
|
||
}
|
||
},
|
||
'(' => sub {
|
||
|
||
++$paren_depth;
|
||
$paren_semicolon_count[$paren_depth] = 0;
|
||
if ($want_paren) {
|
||
$container_type = $want_paren;
|
||
$want_paren = "";
|
||
}
|
||
elsif ( $statement_type =~ /^sub\b/ ) {
|
||
$container_type = $statement_type;
|
||
}
|
||
else {
|
||
$container_type = $last_nonblank_token;
|
||
|
||
# We can check for a syntax error here of unexpected '(',
|
||
# but this is going to get messy...
|
||
if (
|
||
$expecting == OPERATOR
|
||
|
||
# be sure this is not a method call of the form
|
||
# &method(...), $method->(..), &{method}(...),
|
||
# $ref[2](list) is ok & short for $ref[2]->(list)
|
||
# NOTE: at present, braces in something like &{ xxx }
|
||
# are not marked as a block, we might have a method call
|
||
&& $last_nonblank_token !~ /^([\]\}\&]|\-\>)/
|
||
|
||
)
|
||
{
|
||
|
||
# ref: camel 3 p 703.
|
||
if ( $last_last_nonblank_token eq 'do' ) {
|
||
complain(
|
||
"do SUBROUTINE is deprecated; consider & or -> notation\n"
|
||
);
|
||
}
|
||
else {
|
||
|
||
# if this is an empty list, (), then it is not an
|
||
# error; for example, we might have a constant pi and
|
||
# invoke it with pi() or just pi;
|
||
my ( $next_nonblank_token, $i_next ) =
|
||
find_next_nonblank_token( $i, $rtokens,
|
||
$max_token_index );
|
||
if ( $next_nonblank_token ne ')' ) {
|
||
my $hint;
|
||
|
||
# FIXME: this gives an error parsing something like
|
||
# $subsubs[0]()(0);
|
||
# which is a valid syntax (see subsub.t). We may
|
||
# need to revise this coding.
|
||
error_if_expecting_OPERATOR('(');
|
||
|
||
if ( $last_nonblank_type eq 'C' ) {
|
||
$hint =
|
||
"$last_nonblank_token has a void prototype\n";
|
||
}
|
||
elsif ( $last_nonblank_type eq 'i' ) {
|
||
if ( $i_tok > 0
|
||
&& $last_nonblank_token =~ /^\$/ )
|
||
{
|
||
$hint =
|
||
"Do you mean '$last_nonblank_token->(' ?\n";
|
||
}
|
||
}
|
||
if ($hint) {
|
||
interrupt_logfile();
|
||
warning($hint);
|
||
resume_logfile();
|
||
}
|
||
} ## end if ( $next_nonblank_token...
|
||
} ## end else [ if ( $last_last_nonblank_token...
|
||
} ## end if ( $expecting == OPERATOR...
|
||
}
|
||
$paren_type[$paren_depth] = $container_type;
|
||
( $type_sequence, $indent_flag ) =
|
||
increase_nesting_depth( PAREN, $rtoken_map->[$i_tok] );
|
||
|
||
# propagate types down through nested parens
|
||
# for example: the second paren in 'if ((' would be structural
|
||
# since the first is.
|
||
|
||
if ( $last_nonblank_token eq '(' ) {
|
||
$type = $last_nonblank_type;
|
||
}
|
||
|
||
# We exclude parens as structural after a ',' because it
|
||
# causes subtle problems with continuation indentation for
|
||
# something like this, where the first 'or' will not get
|
||
# indented.
|
||
#
|
||
# assert(
|
||
# __LINE__,
|
||
# ( not defined $check )
|
||
# or ref $check
|
||
# or $check eq "new"
|
||
# or $check eq "old",
|
||
# );
|
||
#
|
||
# Likewise, we exclude parens where a statement can start
|
||
# because of problems with continuation indentation, like
|
||
# these:
|
||
#
|
||
# ($firstline =~ /^#\!.*perl/)
|
||
# and (print $File::Find::name, "\n")
|
||
# and (return 1);
|
||
#
|
||
# (ref($usage_fref) =~ /CODE/)
|
||
# ? &$usage_fref
|
||
# : (&blast_usage, &blast_params, &blast_general_params);
|
||
|
||
else {
|
||
$type = '{';
|
||
}
|
||
|
||
if ( $last_nonblank_type eq ')' ) {
|
||
warning(
|
||
"Syntax error? found token '$last_nonblank_type' then '('\n"
|
||
);
|
||
}
|
||
$paren_structural_type[$paren_depth] = $type;
|
||
|
||
},
|
||
')' => sub {
|
||
( $type_sequence, $indent_flag ) =
|
||
decrease_nesting_depth( PAREN, $rtoken_map->[$i_tok] );
|
||
|
||
if ( $paren_structural_type[$paren_depth] eq '{' ) {
|
||
$type = '}';
|
||
}
|
||
|
||
$container_type = $paren_type[$paren_depth];
|
||
|
||
# restore statement type as 'sub' at closing paren of a signature
|
||
# so that a subsequent ':' is identified as an attribute
|
||
if ( $container_type =~ /^sub\b/ ) {
|
||
$statement_type = $container_type;
|
||
}
|
||
|
||
# /^(for|foreach)$/
|
||
if ( $is_for_foreach{ $paren_type[$paren_depth] } ) {
|
||
my $num_sc = $paren_semicolon_count[$paren_depth];
|
||
if ( $num_sc > 0 && $num_sc != 2 ) {
|
||
warning("Expected 2 ';' in 'for(;;)' but saw $num_sc\n");
|
||
}
|
||
}
|
||
|
||
if ( $paren_depth > 0 ) { $paren_depth-- }
|
||
},
|
||
',' => sub {
|
||
if ( $last_nonblank_type eq ',' ) {
|
||
complain("Repeated ','s \n");
|
||
}
|
||
|
||
# patch for operator_expected: note if we are in the list (use.t)
|
||
if ( $statement_type eq 'use' ) { $statement_type = '_use' }
|
||
## FIXME: need to move this elsewhere, perhaps check after a '('
|
||
## elsif ($last_nonblank_token eq '(') {
|
||
## warning("Leading ','s illegal in some versions of perl\n");
|
||
## }
|
||
},
|
||
';' => sub {
|
||
$context = UNKNOWN_CONTEXT;
|
||
$statement_type = '';
|
||
$want_paren = "";
|
||
|
||
# /^(for|foreach)$/
|
||
if ( $is_for_foreach{ $paren_type[$paren_depth] } )
|
||
{ # mark ; in for loop
|
||
|
||
# Be careful: we do not want a semicolon such as the
|
||
# following to be included:
|
||
#
|
||
# for (sort {strcoll($a,$b);} keys %investments) {
|
||
|
||
if ( $brace_depth == $depth_array[PAREN][BRACE][$paren_depth]
|
||
&& $square_bracket_depth ==
|
||
$depth_array[PAREN][SQUARE_BRACKET][$paren_depth] )
|
||
{
|
||
|
||
$type = 'f';
|
||
$paren_semicolon_count[$paren_depth]++;
|
||
}
|
||
}
|
||
|
||
},
|
||
'"' => sub {
|
||
error_if_expecting_OPERATOR("String")
|
||
if ( $expecting == OPERATOR );
|
||
$in_quote = 1;
|
||
$type = 'Q';
|
||
$allowed_quote_modifiers = "";
|
||
},
|
||
"'" => sub {
|
||
error_if_expecting_OPERATOR("String")
|
||
if ( $expecting == OPERATOR );
|
||
$in_quote = 1;
|
||
$type = 'Q';
|
||
$allowed_quote_modifiers = "";
|
||
},
|
||
'`' => sub {
|
||
error_if_expecting_OPERATOR("String")
|
||
if ( $expecting == OPERATOR );
|
||
$in_quote = 1;
|
||
$type = 'Q';
|
||
$allowed_quote_modifiers = "";
|
||
},
|
||
'/' => sub {
|
||
my $is_pattern;
|
||
|
||
# a pattern cannot follow certain keywords which take optional
|
||
# arguments, like 'shift' and 'pop'. See also '?'.
|
||
if ( $last_nonblank_type eq 'k'
|
||
&& $is_keyword_rejecting_slash_as_pattern_delimiter{
|
||
$last_nonblank_token} )
|
||
{
|
||
$is_pattern = 0;
|
||
}
|
||
elsif ( $expecting == UNKNOWN ) { # indeterminate, must guess..
|
||
my $msg;
|
||
( $is_pattern, $msg ) =
|
||
guess_if_pattern_or_division( $i, $rtokens, $rtoken_map,
|
||
$max_token_index );
|
||
|
||
if ($msg) {
|
||
write_diagnostics("DIVIDE:$msg\n");
|
||
write_logfile_entry($msg);
|
||
}
|
||
}
|
||
else { $is_pattern = ( $expecting == TERM ) }
|
||
|
||
if ($is_pattern) {
|
||
$in_quote = 1;
|
||
$type = 'Q';
|
||
$allowed_quote_modifiers = '[msixpodualngc]';
|
||
}
|
||
else { # not a pattern; check for a /= token
|
||
|
||
if ( $rtokens->[ $i + 1 ] eq '=' ) { # form token /=
|
||
$i++;
|
||
$tok = '/=';
|
||
$type = $tok;
|
||
}
|
||
|
||
#DEBUG - collecting info on what tokens follow a divide
|
||
# for development of guessing algorithm
|
||
#if ( numerator_expected( $i, $rtokens, $max_token_index ) < 0 ) {
|
||
# #write_diagnostics( "DIVIDE? $input_line\n" );
|
||
#}
|
||
}
|
||
},
|
||
'{' => sub {
|
||
|
||
# if we just saw a ')', we will label this block with
|
||
# its type. We need to do this to allow sub
|
||
# code_block_type to determine if this brace starts a
|
||
# code block or anonymous hash. (The type of a paren
|
||
# pair is the preceding token, such as 'if', 'else',
|
||
# etc).
|
||
$container_type = "";
|
||
|
||
# ATTRS: for a '{' following an attribute list, reset
|
||
# things to look like we just saw the sub name
|
||
# FIXME: need to end with \b here??
|
||
if ( $statement_type =~ /^sub/ ) {
|
||
$last_nonblank_token = $statement_type;
|
||
$last_nonblank_type = 'i';
|
||
$statement_type = "";
|
||
}
|
||
|
||
# patch for SWITCH/CASE: hide these keywords from an immediately
|
||
# following opening brace
|
||
elsif ( ( $statement_type eq 'case' || $statement_type eq 'when' )
|
||
&& $statement_type eq $last_nonblank_token )
|
||
{
|
||
$last_nonblank_token = ";";
|
||
}
|
||
|
||
elsif ( $last_nonblank_token eq ')' ) {
|
||
$last_nonblank_token = $paren_type[ $paren_depth + 1 ];
|
||
|
||
# defensive move in case of a nesting error (pbug.t)
|
||
# in which this ')' had no previous '('
|
||
# this nesting error will have been caught
|
||
if ( !defined($last_nonblank_token) ) {
|
||
$last_nonblank_token = 'if';
|
||
}
|
||
|
||
# check for syntax error here;
|
||
unless ( $is_blocktype_with_paren{$last_nonblank_token} ) {
|
||
if ( $tokenizer_self->[_extended_syntax_] ) {
|
||
|
||
# we append a trailing () to mark this as an unknown
|
||
# block type. This allows perltidy to format some
|
||
# common extensions of perl syntax.
|
||
# This is used by sub code_block_type
|
||
$last_nonblank_token .= '()';
|
||
}
|
||
else {
|
||
my $list =
|
||
join( ' ', sort keys %is_blocktype_with_paren );
|
||
warning(
|
||
"syntax error at ') {', didn't see one of: <<$list>>; If this code is okay try using the -xs flag\n"
|
||
);
|
||
}
|
||
}
|
||
}
|
||
|
||
# patch for paren-less for/foreach glitch, part 2.
|
||
# see note below under 'qw'
|
||
elsif ($last_nonblank_token eq 'qw'
|
||
&& $is_for_foreach{$want_paren} )
|
||
{
|
||
$last_nonblank_token = $want_paren;
|
||
if ( $last_last_nonblank_token eq $want_paren ) {
|
||
warning(
|
||
"syntax error at '$want_paren .. {' -- missing \$ loop variable\n"
|
||
);
|
||
|
||
}
|
||
$want_paren = "";
|
||
}
|
||
|
||
# now identify which of the three possible types of
|
||
# curly braces we have: hash index container, anonymous
|
||
# hash reference, or code block.
|
||
|
||
# non-structural (hash index) curly brace pair
|
||
# get marked 'L' and 'R'
|
||
if ( is_non_structural_brace() ) {
|
||
$type = 'L';
|
||
|
||
# patch for SWITCH/CASE:
|
||
# allow paren-less identifier after 'when'
|
||
# if the brace is preceded by a space
|
||
if ( $statement_type eq 'when'
|
||
&& $last_nonblank_type eq 'i'
|
||
&& $last_last_nonblank_type eq 'k'
|
||
&& ( $i_tok == 0 || $rtoken_type->[ $i_tok - 1 ] eq 'b' ) )
|
||
{
|
||
$type = '{';
|
||
$block_type = $statement_type;
|
||
}
|
||
}
|
||
|
||
# code and anonymous hash have the same type, '{', but are
|
||
# distinguished by 'block_type',
|
||
# which will be blank for an anonymous hash
|
||
else {
|
||
|
||
$block_type = code_block_type( $i_tok, $rtokens, $rtoken_type,
|
||
$max_token_index );
|
||
|
||
# patch to promote bareword type to function taking block
|
||
if ( $block_type
|
||
&& $last_nonblank_type eq 'w'
|
||
&& $last_nonblank_i >= 0 )
|
||
{
|
||
if ( $routput_token_type->[$last_nonblank_i] eq 'w' ) {
|
||
$routput_token_type->[$last_nonblank_i] = 'G';
|
||
}
|
||
}
|
||
|
||
# patch for SWITCH/CASE: if we find a stray opening block brace
|
||
# where we might accept a 'case' or 'when' block, then take it
|
||
if ( $statement_type eq 'case'
|
||
|| $statement_type eq 'when' )
|
||
{
|
||
if ( !$block_type || $block_type eq '}' ) {
|
||
$block_type = $statement_type;
|
||
}
|
||
}
|
||
}
|
||
|
||
$brace_type[ ++$brace_depth ] = $block_type;
|
||
$brace_package[$brace_depth] = $current_package;
|
||
$brace_structural_type[$brace_depth] = $type;
|
||
$brace_context[$brace_depth] = $context;
|
||
( $type_sequence, $indent_flag ) =
|
||
increase_nesting_depth( BRACE, $rtoken_map->[$i_tok] );
|
||
},
|
||
'}' => sub {
|
||
$block_type = $brace_type[$brace_depth];
|
||
if ($block_type) { $statement_type = '' }
|
||
if ( defined( $brace_package[$brace_depth] ) ) {
|
||
$current_package = $brace_package[$brace_depth];
|
||
}
|
||
|
||
# can happen on brace error (caught elsewhere)
|
||
else {
|
||
}
|
||
( $type_sequence, $indent_flag ) =
|
||
decrease_nesting_depth( BRACE, $rtoken_map->[$i_tok] );
|
||
|
||
if ( $brace_structural_type[$brace_depth] eq 'L' ) {
|
||
$type = 'R';
|
||
}
|
||
|
||
# propagate type information for 'do' and 'eval' blocks, and also
|
||
# for smartmatch operator. This is necessary to enable us to know
|
||
# if an operator or term is expected next.
|
||
if ( $is_block_operator{$block_type} ) {
|
||
$tok = $block_type;
|
||
}
|
||
|
||
$context = $brace_context[$brace_depth];
|
||
if ( $brace_depth > 0 ) { $brace_depth--; }
|
||
},
|
||
'&' => sub { # maybe sub call? start looking
|
||
|
||
# We have to check for sub call unless we are sure we
|
||
# are expecting an operator. This example from s2p
|
||
# got mistaken as a q operator in an early version:
|
||
# print BODY &q(<<'EOT');
|
||
if ( $expecting != OPERATOR ) {
|
||
|
||
# But only look for a sub call if we are expecting a term or
|
||
# if there is no existing space after the &.
|
||
# For example we probably don't want & as sub call here:
|
||
# Fcntl::S_IRUSR & $mode;
|
||
if ( $expecting == TERM || $next_type ne 'b' ) {
|
||
scan_identifier_fast();
|
||
}
|
||
}
|
||
else {
|
||
}
|
||
},
|
||
'<' => sub { # angle operator or less than?
|
||
|
||
if ( $expecting != OPERATOR ) {
|
||
( $i, $type ) =
|
||
find_angle_operator_termination( $input_line, $i, $rtoken_map,
|
||
$expecting, $max_token_index );
|
||
|
||
## This message is not very helpful and quite confusing if the above
|
||
## routine decided not to write a message with the line number.
|
||
## if ( $type eq '<' && $expecting == TERM ) {
|
||
## error_if_expecting_TERM();
|
||
## interrupt_logfile();
|
||
## warning("Unterminated <> operator?\n");
|
||
## resume_logfile();
|
||
## }
|
||
|
||
}
|
||
else {
|
||
}
|
||
},
|
||
'?' => sub { # ?: conditional or starting pattern?
|
||
|
||
my $is_pattern;
|
||
|
||
# Patch for rt #126965
|
||
# a pattern cannot follow certain keywords which take optional
|
||
# arguments, like 'shift' and 'pop'. See also '/'.
|
||
if ( $last_nonblank_type eq 'k'
|
||
&& $is_keyword_rejecting_question_as_pattern_delimiter{
|
||
$last_nonblank_token} )
|
||
{
|
||
$is_pattern = 0;
|
||
}
|
||
|
||
# patch for RT#131288, user constant function without prototype
|
||
# last type is 'U' followed by ?.
|
||
elsif ( $last_nonblank_type =~ /^[FUY]$/ ) {
|
||
$is_pattern = 0;
|
||
}
|
||
elsif ( $expecting == UNKNOWN ) {
|
||
|
||
# In older versions of Perl, a bare ? can be a pattern
|
||
# delimiter. In perl version 5.22 this was
|
||
# dropped, but we have to support it in order to format
|
||
# older programs. See:
|
||
## https://perl.developpez.com/documentations/en/5.22.0/perl5211delta.html
|
||
# For example, the following line worked
|
||
# at one time:
|
||
# ?(.*)? && (print $1,"\n");
|
||
# In current versions it would have to be written with slashes:
|
||
# /(.*)/ && (print $1,"\n");
|
||
my $msg;
|
||
( $is_pattern, $msg ) =
|
||
guess_if_pattern_or_conditional( $i, $rtokens, $rtoken_map,
|
||
$max_token_index );
|
||
|
||
if ($msg) { write_logfile_entry($msg) }
|
||
}
|
||
else { $is_pattern = ( $expecting == TERM ) }
|
||
|
||
if ($is_pattern) {
|
||
$in_quote = 1;
|
||
$type = 'Q';
|
||
$allowed_quote_modifiers = '[msixpodualngc]';
|
||
}
|
||
else {
|
||
( $type_sequence, $indent_flag ) =
|
||
increase_nesting_depth( QUESTION_COLON,
|
||
$rtoken_map->[$i_tok] );
|
||
}
|
||
},
|
||
'*' => sub { # typeglob, or multiply?
|
||
|
||
if ( $expecting == TERM ) {
|
||
scan_identifier_fast();
|
||
}
|
||
else {
|
||
|
||
if ( $rtokens->[ $i + 1 ] eq '=' ) {
|
||
$tok = '*=';
|
||
$type = $tok;
|
||
$i++;
|
||
}
|
||
elsif ( $rtokens->[ $i + 1 ] eq '*' ) {
|
||
$tok = '**';
|
||
$type = $tok;
|
||
$i++;
|
||
if ( $rtokens->[ $i + 1 ] eq '=' ) {
|
||
$tok = '**=';
|
||
$type = $tok;
|
||
$i++;
|
||
}
|
||
}
|
||
}
|
||
},
|
||
'.' => sub { # what kind of . ?
|
||
|
||
if ( $expecting != OPERATOR ) {
|
||
scan_number();
|
||
if ( $type eq '.' ) {
|
||
error_if_expecting_TERM()
|
||
if ( $expecting == TERM );
|
||
}
|
||
}
|
||
else {
|
||
}
|
||
},
|
||
':' => sub {
|
||
|
||
# if this is the first nonblank character, call it a label
|
||
# since perl seems to just swallow it
|
||
if ( $input_line_number == 1 && $last_nonblank_i == -1 ) {
|
||
$type = 'J';
|
||
}
|
||
|
||
# ATTRS: check for a ':' which introduces an attribute list
|
||
# either after a 'sub' keyword or within a paren list
|
||
elsif ( $statement_type =~ /^sub\b/ ) {
|
||
$type = 'A';
|
||
$in_attribute_list = 1;
|
||
}
|
||
|
||
# Withing a signature, unless we are in a ternary. For example,
|
||
# from 't/filter_example.t':
|
||
# method foo4 ( $class: $bar ) { $class->bar($bar) }
|
||
elsif ( $paren_type[$paren_depth] =~ /^sub\b/
|
||
&& !is_balanced_closing_container(QUESTION_COLON) )
|
||
{
|
||
$type = 'A';
|
||
$in_attribute_list = 1;
|
||
}
|
||
|
||
# check for scalar attribute, such as
|
||
# my $foo : shared = 1;
|
||
elsif ($is_my_our_state{$statement_type}
|
||
&& $current_depth[QUESTION_COLON] == 0 )
|
||
{
|
||
$type = 'A';
|
||
$in_attribute_list = 1;
|
||
}
|
||
|
||
# Look for Switch::Plain syntax if an error would otherwise occur
|
||
# here. Note that we do not need to check if the extended syntax
|
||
# flag is set because otherwise an error would occur, and we would
|
||
# then have to output a message telling the user to set the
|
||
# extended syntax flag to avoid the error.
|
||
# case 1: {
|
||
# default: {
|
||
# default:
|
||
# Note that the line 'default:' will be parsed as a label elsewhere.
|
||
elsif ( $is_case_default{$statement_type}
|
||
&& !is_balanced_closing_container(QUESTION_COLON) )
|
||
{
|
||
# mark it as a perltidy label type
|
||
$type = 'J';
|
||
}
|
||
|
||
# otherwise, it should be part of a ?/: operator
|
||
else {
|
||
( $type_sequence, $indent_flag ) =
|
||
decrease_nesting_depth( QUESTION_COLON,
|
||
$rtoken_map->[$i_tok] );
|
||
if ( $last_nonblank_token eq '?' ) {
|
||
warning("Syntax error near ? :\n");
|
||
}
|
||
}
|
||
},
|
||
'+' => sub { # what kind of plus?
|
||
|
||
if ( $expecting == TERM ) {
|
||
my $number = scan_number_fast();
|
||
|
||
# unary plus is safest assumption if not a number
|
||
if ( !defined($number) ) { $type = 'p'; }
|
||
}
|
||
elsif ( $expecting == OPERATOR ) {
|
||
}
|
||
else {
|
||
if ( $next_type eq 'w' ) { $type = 'p' }
|
||
}
|
||
},
|
||
'@' => sub {
|
||
|
||
error_if_expecting_OPERATOR("Array")
|
||
if ( $expecting == OPERATOR );
|
||
scan_identifier_fast();
|
||
},
|
||
'%' => sub { # hash or modulo?
|
||
|
||
# first guess is hash if no following blank
|
||
if ( $expecting == UNKNOWN ) {
|
||
if ( $next_type ne 'b' ) { $expecting = TERM }
|
||
}
|
||
if ( $expecting == TERM ) {
|
||
scan_identifier_fast();
|
||
}
|
||
},
|
||
'[' => sub {
|
||
$square_bracket_type[ ++$square_bracket_depth ] =
|
||
$last_nonblank_token;
|
||
( $type_sequence, $indent_flag ) =
|
||
increase_nesting_depth( SQUARE_BRACKET, $rtoken_map->[$i_tok] );
|
||
|
||
# It may seem odd, but structural square brackets have
|
||
# type '{' and '}'. This simplifies the indentation logic.
|
||
if ( !is_non_structural_brace() ) {
|
||
$type = '{';
|
||
}
|
||
$square_bracket_structural_type[$square_bracket_depth] = $type;
|
||
},
|
||
']' => sub {
|
||
( $type_sequence, $indent_flag ) =
|
||
decrease_nesting_depth( SQUARE_BRACKET, $rtoken_map->[$i_tok] );
|
||
|
||
if ( $square_bracket_structural_type[$square_bracket_depth] eq '{' )
|
||
{
|
||
$type = '}';
|
||
}
|
||
|
||
# propagate type information for smartmatch operator. This is
|
||
# necessary to enable us to know if an operator or term is expected
|
||
# next.
|
||
if ( $square_bracket_type[$square_bracket_depth] eq '~~' ) {
|
||
$tok = $square_bracket_type[$square_bracket_depth];
|
||
}
|
||
|
||
if ( $square_bracket_depth > 0 ) { $square_bracket_depth--; }
|
||
},
|
||
'-' => sub { # what kind of minus?
|
||
|
||
if ( ( $expecting != OPERATOR )
|
||
&& $is_file_test_operator{$next_tok} )
|
||
{
|
||
my ( $next_nonblank_token, $i_next ) =
|
||
find_next_nonblank_token( $i + 1, $rtokens,
|
||
$max_token_index );
|
||
|
||
# check for a quoted word like "-w=>xx";
|
||
# it is sufficient to just check for a following '='
|
||
if ( $next_nonblank_token eq '=' ) {
|
||
$type = 'm';
|
||
}
|
||
else {
|
||
$i++;
|
||
$tok .= $next_tok;
|
||
$type = 'F';
|
||
}
|
||
}
|
||
elsif ( $expecting == TERM ) {
|
||
my $number = scan_number_fast();
|
||
|
||
# maybe part of bareword token? unary is safest
|
||
if ( !defined($number) ) { $type = 'm'; }
|
||
|
||
}
|
||
elsif ( $expecting == OPERATOR ) {
|
||
}
|
||
else {
|
||
|
||
if ( $next_type eq 'w' ) {
|
||
$type = 'm';
|
||
}
|
||
}
|
||
},
|
||
|
||
'^' => sub {
|
||
|
||
# check for special variables like ${^WARNING_BITS}
|
||
if ( $expecting == TERM ) {
|
||
|
||
# FIXME: this should work but will not catch errors
|
||
# because we also have to be sure that previous token is
|
||
# a type character ($,@,%).
|
||
if ( $last_nonblank_token eq '{'
|
||
&& ( $next_tok !~ /^\d/ )
|
||
&& ( $next_tok =~ /^\w/ ) )
|
||
{
|
||
|
||
if ( $next_tok eq 'W' ) {
|
||
$tokenizer_self->[_saw_perl_dash_w_] = 1;
|
||
}
|
||
$tok = $tok . $next_tok;
|
||
$i = $i + 1;
|
||
$type = 'w';
|
||
}
|
||
|
||
else {
|
||
unless ( error_if_expecting_TERM() ) {
|
||
|
||
# Something like this is valid but strange:
|
||
# undef ^I;
|
||
complain("The '^' seems unusual here\n");
|
||
}
|
||
}
|
||
}
|
||
},
|
||
|
||
'::' => sub { # probably a sub call
|
||
scan_bare_identifier();
|
||
},
|
||
'<<' => sub { # maybe a here-doc?
|
||
|
||
## This check removed because it could be a deprecated here-doc with
|
||
## no specified target. See example in log 16 Sep 2020.
|
||
## return
|
||
## unless ( $i < $max_token_index )
|
||
## ; # here-doc not possible if end of line
|
||
|
||
if ( $expecting != OPERATOR ) {
|
||
my ( $found_target, $here_doc_target, $here_quote_character,
|
||
$saw_error );
|
||
(
|
||
$found_target, $here_doc_target, $here_quote_character, $i,
|
||
$saw_error
|
||
)
|
||
= find_here_doc( $expecting, $i, $rtokens, $rtoken_map,
|
||
$max_token_index );
|
||
|
||
if ($found_target) {
|
||
push @{$rhere_target_list},
|
||
[ $here_doc_target, $here_quote_character ];
|
||
$type = 'h';
|
||
if ( length($here_doc_target) > 80 ) {
|
||
my $truncated = substr( $here_doc_target, 0, 80 );
|
||
complain("Long here-target: '$truncated' ...\n");
|
||
}
|
||
elsif ( !$here_doc_target ) {
|
||
warning(
|
||
'Use of bare << to mean <<"" is deprecated' . "\n" )
|
||
unless ($here_quote_character);
|
||
}
|
||
elsif ( $here_doc_target !~ /^[A-Z_]\w+$/ ) {
|
||
complain(
|
||
"Unconventional here-target: '$here_doc_target'\n");
|
||
}
|
||
}
|
||
elsif ( $expecting == TERM ) {
|
||
unless ($saw_error) {
|
||
|
||
# shouldn't happen..
|
||
warning("Program bug; didn't find here doc target\n");
|
||
report_definite_bug();
|
||
}
|
||
}
|
||
}
|
||
else {
|
||
}
|
||
},
|
||
'<<~' => sub { # a here-doc, new type added in v26
|
||
return
|
||
unless ( $i < $max_token_index )
|
||
; # here-doc not possible if end of line
|
||
if ( $expecting != OPERATOR ) {
|
||
my ( $found_target, $here_doc_target, $here_quote_character,
|
||
$saw_error );
|
||
(
|
||
$found_target, $here_doc_target, $here_quote_character, $i,
|
||
$saw_error
|
||
)
|
||
= find_here_doc( $expecting, $i, $rtokens, $rtoken_map,
|
||
$max_token_index );
|
||
|
||
if ($found_target) {
|
||
|
||
if ( length($here_doc_target) > 80 ) {
|
||
my $truncated = substr( $here_doc_target, 0, 80 );
|
||
complain("Long here-target: '$truncated' ...\n");
|
||
}
|
||
elsif ( $here_doc_target !~ /^[A-Z_]\w+$/ ) {
|
||
complain(
|
||
"Unconventional here-target: '$here_doc_target'\n");
|
||
}
|
||
|
||
# Note that we put a leading space on the here quote
|
||
# character indicate that it may be preceded by spaces
|
||
$here_quote_character = " " . $here_quote_character;
|
||
push @{$rhere_target_list},
|
||
[ $here_doc_target, $here_quote_character ];
|
||
$type = 'h';
|
||
}
|
||
elsif ( $expecting == TERM ) {
|
||
unless ($saw_error) {
|
||
|
||
# shouldn't happen..
|
||
warning("Program bug; didn't find here doc target\n");
|
||
report_definite_bug();
|
||
}
|
||
}
|
||
}
|
||
else {
|
||
}
|
||
},
|
||
'->' => sub {
|
||
|
||
# if -> points to a bare word, we must scan for an identifier,
|
||
# otherwise something like ->y would look like the y operator
|
||
scan_identifier_fast();
|
||
},
|
||
|
||
# type = 'pp' for pre-increment, '++' for post-increment
|
||
'++' => sub {
|
||
if ( $expecting == TERM ) { $type = 'pp' }
|
||
elsif ( $expecting == UNKNOWN ) {
|
||
my ( $next_nonblank_token, $i_next ) =
|
||
find_next_nonblank_token( $i, $rtokens, $max_token_index );
|
||
if ( $next_nonblank_token eq '$' ) { $type = 'pp' }
|
||
}
|
||
},
|
||
|
||
'=>' => sub {
|
||
if ( $last_nonblank_type eq $tok ) {
|
||
complain("Repeated '=>'s \n");
|
||
}
|
||
|
||
# patch for operator_expected: note if we are in the list (use.t)
|
||
# TODO: make version numbers a new token type
|
||
if ( $statement_type eq 'use' ) { $statement_type = '_use' }
|
||
},
|
||
|
||
# type = 'mm' for pre-decrement, '--' for post-decrement
|
||
'--' => sub {
|
||
|
||
if ( $expecting == TERM ) { $type = 'mm' }
|
||
elsif ( $expecting == UNKNOWN ) {
|
||
my ( $next_nonblank_token, $i_next ) =
|
||
find_next_nonblank_token( $i, $rtokens, $max_token_index );
|
||
if ( $next_nonblank_token eq '$' ) { $type = 'mm' }
|
||
}
|
||
},
|
||
|
||
'&&' => sub {
|
||
error_if_expecting_TERM()
|
||
if ( $expecting == TERM );
|
||
},
|
||
|
||
'||' => sub {
|
||
error_if_expecting_TERM()
|
||
if ( $expecting == TERM );
|
||
},
|
||
|
||
'//' => sub {
|
||
error_if_expecting_TERM()
|
||
if ( $expecting == TERM );
|
||
},
|
||
};
|
||
|
||
# ------------------------------------------------------------
|
||
# end hash of code for handling individual token types
|
||
# ------------------------------------------------------------
|
||
|
||
my %matching_start_token = ( '}' => '{', ']' => '[', ')' => '(' );
|
||
|
||
# These block types terminate statements and do not need a trailing
|
||
# semicolon
|
||
# patched for SWITCH/CASE/
|
||
my %is_zero_continuation_block_type;
|
||
@_ = qw( } { BEGIN END CHECK INIT AUTOLOAD DESTROY UNITCHECK continue ;
|
||
if elsif else unless while until for foreach switch case given when);
|
||
@is_zero_continuation_block_type{@_} = (1) x scalar(@_);
|
||
|
||
my %is_not_zero_continuation_block_type;
|
||
@_ = qw(sort grep map do eval);
|
||
@is_not_zero_continuation_block_type{@_} = (1) x scalar(@_);
|
||
|
||
my %is_logical_container;
|
||
@_ = qw(if elsif unless while and or err not && ! || for foreach);
|
||
@is_logical_container{@_} = (1) x scalar(@_);
|
||
|
||
my %is_binary_type;
|
||
@_ = qw(|| &&);
|
||
@is_binary_type{@_} = (1) x scalar(@_);
|
||
|
||
my %is_binary_keyword;
|
||
@_ = qw(and or err eq ne cmp);
|
||
@is_binary_keyword{@_} = (1) x scalar(@_);
|
||
|
||
# 'L' is token for opening { at hash key
|
||
my %is_opening_type;
|
||
@_ = qw< L { ( [ >;
|
||
@is_opening_type{@_} = (1) x scalar(@_);
|
||
|
||
# 'R' is token for closing } at hash key
|
||
my %is_closing_type;
|
||
@_ = qw< R } ) ] >;
|
||
@is_closing_type{@_} = (1) x scalar(@_);
|
||
|
||
my %is_redo_last_next_goto;
|
||
@_ = qw(redo last next goto);
|
||
@is_redo_last_next_goto{@_} = (1) x scalar(@_);
|
||
|
||
my %is_use_require;
|
||
@_ = qw(use require);
|
||
@is_use_require{@_} = (1) x scalar(@_);
|
||
|
||
# This hash holds the array index in $tokenizer_self for these keywords:
|
||
my %is_format_END_DATA = (
|
||
'format' => _in_format_,
|
||
'__END__' => _in_end_,
|
||
'__DATA__' => _in_data_,
|
||
);
|
||
|
||
# original ref: camel 3 p 147,
|
||
# but perl may accept undocumented flags
|
||
# perl 5.10 adds 'p' (preserve)
|
||
# Perl version 5.22 added 'n'
|
||
# From http://perldoc.perl.org/perlop.html we have
|
||
# /PATTERN/msixpodualngc or m?PATTERN?msixpodualngc
|
||
# s/PATTERN/REPLACEMENT/msixpodualngcer
|
||
# y/SEARCHLIST/REPLACEMENTLIST/cdsr
|
||
# tr/SEARCHLIST/REPLACEMENTLIST/cdsr
|
||
# qr/STRING/msixpodualn
|
||
my %quote_modifiers = (
|
||
's' => '[msixpodualngcer]',
|
||
'y' => '[cdsr]',
|
||
'tr' => '[cdsr]',
|
||
'm' => '[msixpodualngc]',
|
||
'qr' => '[msixpodualn]',
|
||
'q' => "",
|
||
'qq' => "",
|
||
'qw' => "",
|
||
'qx' => "",
|
||
);
|
||
|
||
# table showing how many quoted things to look for after quote operator..
|
||
# s, y, tr have 2 (pattern and replacement)
|
||
# others have 1 (pattern only)
|
||
my %quote_items = (
|
||
's' => 2,
|
||
'y' => 2,
|
||
'tr' => 2,
|
||
'm' => 1,
|
||
'qr' => 1,
|
||
'q' => 1,
|
||
'qq' => 1,
|
||
'qw' => 1,
|
||
'qx' => 1,
|
||
);
|
||
|
||
use constant DEBUG_TOKENIZE => 0;
|
||
|
||
sub tokenize_this_line {
|
||
|
||
# This routine breaks a line of perl code into tokens which are of use in
|
||
# indentation and reformatting. One of my goals has been to define tokens
|
||
# such that a newline may be inserted between any pair of tokens without
|
||
# changing or invalidating the program. This version comes close to this,
|
||
# although there are necessarily a few exceptions which must be caught by
|
||
# the formatter. Many of these involve the treatment of bare words.
|
||
#
|
||
# The tokens and their types are returned in arrays. See previous
|
||
# routine for their names.
|
||
#
|
||
# See also the array "valid_token_types" in the BEGIN section for an
|
||
# up-to-date list.
|
||
#
|
||
# To simplify things, token types are either a single character, or they
|
||
# are identical to the tokens themselves.
|
||
#
|
||
# As a debugging aid, the -D flag creates a file containing a side-by-side
|
||
# comparison of the input string and its tokenization for each line of a file.
|
||
# This is an invaluable debugging aid.
|
||
#
|
||
# In addition to tokens, and some associated quantities, the tokenizer
|
||
# also returns flags indication any special line types. These include
|
||
# quotes, here_docs, formats.
|
||
#
|
||
# -----------------------------------------------------------------------
|
||
#
|
||
# How to add NEW_TOKENS:
|
||
#
|
||
# New token types will undoubtedly be needed in the future both to keep up
|
||
# with changes in perl and to help adapt the tokenizer to other applications.
|
||
#
|
||
# Here are some notes on the minimal steps. I wrote these notes while
|
||
# adding the 'v' token type for v-strings, which are things like version
|
||
# numbers 5.6.0, and ip addresses, and will use that as an example. ( You
|
||
# can use your editor to search for the string "NEW_TOKENS" to find the
|
||
# appropriate sections to change):
|
||
#
|
||
# *. Try to talk somebody else into doing it! If not, ..
|
||
#
|
||
# *. Make a backup of your current version in case things don't work out!
|
||
#
|
||
# *. Think of a new, unused character for the token type, and add to
|
||
# the array @valid_token_types in the BEGIN section of this package.
|
||
# For example, I used 'v' for v-strings.
|
||
#
|
||
# *. Implement coding to recognize the $type of the token in this routine.
|
||
# This is the hardest part, and is best done by imitating or modifying
|
||
# some of the existing coding. For example, to recognize v-strings, I
|
||
# patched 'sub scan_bare_identifier' to recognize v-strings beginning with
|
||
# 'v' and 'sub scan_number' to recognize v-strings without the leading 'v'.
|
||
#
|
||
# *. Update sub operator_expected. This update is critically important but
|
||
# the coding is trivial. Look at the comments in that routine for help.
|
||
# For v-strings, which should behave like numbers, I just added 'v' to the
|
||
# regex used to handle numbers and strings (types 'n' and 'Q').
|
||
#
|
||
# *. Implement a 'bond strength' rule in sub set_bond_strengths in
|
||
# Perl::Tidy::Formatter for breaking lines around this token type. You can
|
||
# skip this step and take the default at first, then adjust later to get
|
||
# desired results. For adding type 'v', I looked at sub bond_strength and
|
||
# saw that number type 'n' was using default strengths, so I didn't do
|
||
# anything. I may tune it up someday if I don't like the way line
|
||
# breaks with v-strings look.
|
||
#
|
||
# *. Implement a 'whitespace' rule in sub set_whitespace_flags in
|
||
# Perl::Tidy::Formatter. For adding type 'v', I looked at this routine
|
||
# and saw that type 'n' used spaces on both sides, so I just added 'v'
|
||
# to the array @spaces_both_sides.
|
||
#
|
||
# *. Update HtmlWriter package so that users can colorize the token as
|
||
# desired. This is quite easy; see comments identified by 'NEW_TOKENS' in
|
||
# that package. For v-strings, I initially chose to use a default color
|
||
# equal to the default for numbers, but it might be nice to change that
|
||
# eventually.
|
||
#
|
||
# *. Update comments in Perl::Tidy::Tokenizer::dump_token_types.
|
||
#
|
||
# *. Run lots and lots of debug tests. Start with special files designed
|
||
# to test the new token type. Run with the -D flag to create a .DEBUG
|
||
# file which shows the tokenization. When these work ok, test as many old
|
||
# scripts as possible. Start with all of the '.t' files in the 'test'
|
||
# directory of the distribution file. Compare .tdy output with previous
|
||
# version and updated version to see the differences. Then include as
|
||
# many more files as possible. My own technique has been to collect a huge
|
||
# number of perl scripts (thousands!) into one directory and run perltidy
|
||
# *, then run diff between the output of the previous version and the
|
||
# current version.
|
||
#
|
||
# *. For another example, search for the smartmatch operator '~~'
|
||
# with your editor to see where updates were made for it.
|
||
#
|
||
# -----------------------------------------------------------------------
|
||
|
||
my $line_of_tokens = shift;
|
||
my ($untrimmed_input_line) = $line_of_tokens->{_line_text};
|
||
|
||
# patch while coding change is underway
|
||
# make callers private data to allow access
|
||
# $tokenizer_self = $caller_tokenizer_self;
|
||
|
||
# extract line number for use in error messages
|
||
$input_line_number = $line_of_tokens->{_line_number};
|
||
|
||
# reinitialize for multi-line quote
|
||
$line_of_tokens->{_starting_in_quote} = $in_quote && $quote_type eq 'Q';
|
||
|
||
# check for pod documentation
|
||
if ( substr( $untrimmed_input_line, 0, 1 ) eq '='
|
||
&& $untrimmed_input_line =~ /^=[A-Za-z_]/ )
|
||
{
|
||
|
||
# must not be in multi-line quote
|
||
# and must not be in an equation
|
||
if ( !$in_quote
|
||
&& ( operator_expected( [ 'b', '=', 'b' ] ) == TERM ) )
|
||
{
|
||
$tokenizer_self->[_in_pod_] = 1;
|
||
return;
|
||
}
|
||
}
|
||
|
||
$input_line = $untrimmed_input_line;
|
||
|
||
chomp $input_line;
|
||
|
||
# Set a flag to indicate if we might be at an __END__ or __DATA__ line
|
||
# This will be used below to avoid quoting a bare word followed by
|
||
# a fat comma.
|
||
my $is_END_or_DATA;
|
||
|
||
# trim start of this line unless we are continuing a quoted line
|
||
# do not trim end because we might end in a quote (test: deken4.pl)
|
||
# Perl::Tidy::Formatter will delete needless trailing blanks
|
||
unless ( $in_quote && ( $quote_type eq 'Q' ) ) {
|
||
$input_line =~ s/^\s+//; # trim left end
|
||
|
||
$is_END_or_DATA = substr( $input_line, 0, 1 ) eq '_'
|
||
&& $input_line =~ /^\s*__(END|DATA)__\s*$/;
|
||
}
|
||
|
||
# update the copy of the line for use in error messages
|
||
# This must be exactly what we give the pre_tokenizer
|
||
$tokenizer_self->[_line_of_text_] = $input_line;
|
||
|
||
# re-initialize for the main loop
|
||
$routput_token_list = []; # stack of output token indexes
|
||
$routput_token_type = []; # token types
|
||
$routput_block_type = []; # types of code block
|
||
$routput_container_type = []; # paren types, such as if, elsif, ..
|
||
$routput_type_sequence = []; # nesting sequential number
|
||
|
||
$rhere_target_list = [];
|
||
|
||
$tok = $last_nonblank_token;
|
||
$type = $last_nonblank_type;
|
||
$prototype = $last_nonblank_prototype;
|
||
$last_nonblank_i = -1;
|
||
$block_type = $last_nonblank_block_type;
|
||
$container_type = $last_nonblank_container_type;
|
||
$type_sequence = $last_nonblank_type_sequence;
|
||
$indent_flag = 0;
|
||
$peeked_ahead = 0;
|
||
|
||
# tokenization is done in two stages..
|
||
# stage 1 is a very simple pre-tokenization
|
||
my $max_tokens_wanted = 0; # this signals pre_tokenize to get all tokens
|
||
|
||
# a little optimization for a full-line comment
|
||
if ( !$in_quote && substr( $input_line, 0, 1 ) eq '#' ) {
|
||
$max_tokens_wanted = 1 # no use tokenizing a comment
|
||
}
|
||
|
||
# start by breaking the line into pre-tokens
|
||
( $rtokens, $rtoken_map, $rtoken_type ) =
|
||
pre_tokenize( $input_line, $max_tokens_wanted );
|
||
|
||
$max_token_index = scalar( @{$rtokens} ) - 1;
|
||
push( @{$rtokens}, ' ', ' ', ' ' ); # extra whitespace simplifies logic
|
||
push( @{$rtoken_map}, 0, 0, 0 ); # shouldn't be referenced
|
||
push( @{$rtoken_type}, 'b', 'b', 'b' );
|
||
|
||
# initialize for main loop
|
||
foreach my $ii ( 0 .. $max_token_index + 3 ) {
|
||
$routput_token_type->[$ii] = "";
|
||
$routput_block_type->[$ii] = "";
|
||
$routput_container_type->[$ii] = "";
|
||
$routput_type_sequence->[$ii] = "";
|
||
$routput_indent_flag->[$ii] = 0;
|
||
}
|
||
$i = -1;
|
||
$i_tok = -1;
|
||
|
||
# ------------------------------------------------------------
|
||
# begin main tokenization loop
|
||
# ------------------------------------------------------------
|
||
|
||
# we are looking at each pre-token of one line and combining them
|
||
# into tokens
|
||
while ( ++$i <= $max_token_index ) {
|
||
|
||
if ($in_quote) { # continue looking for end of a quote
|
||
$type = $quote_type;
|
||
|
||
unless ( @{$routput_token_list} )
|
||
{ # initialize if continuation line
|
||
push( @{$routput_token_list}, $i );
|
||
$routput_token_type->[$i] = $type;
|
||
|
||
}
|
||
$tok = $quote_character if ($quote_character);
|
||
|
||
# scan for the end of the quote or pattern
|
||
(
|
||
$i, $in_quote, $quote_character, $quote_pos, $quote_depth,
|
||
$quoted_string_1, $quoted_string_2
|
||
)
|
||
= do_quote(
|
||
$i, $in_quote, $quote_character,
|
||
$quote_pos, $quote_depth, $quoted_string_1,
|
||
$quoted_string_2, $rtokens, $rtoken_map,
|
||
$max_token_index
|
||
);
|
||
|
||
# all done if we didn't find it
|
||
last if ($in_quote);
|
||
|
||
# save pattern and replacement text for rescanning
|
||
my $qs1 = $quoted_string_1;
|
||
my $qs2 = $quoted_string_2;
|
||
|
||
# re-initialize for next search
|
||
$quote_character = '';
|
||
$quote_pos = 0;
|
||
$quote_type = 'Q';
|
||
$quoted_string_1 = "";
|
||
$quoted_string_2 = "";
|
||
last if ( ++$i > $max_token_index );
|
||
|
||
# look for any modifiers
|
||
if ($allowed_quote_modifiers) {
|
||
|
||
# check for exact quote modifiers
|
||
if ( $rtokens->[$i] =~ /^[A-Za-z_]/ ) {
|
||
my $str = $rtokens->[$i];
|
||
my $saw_modifier_e;
|
||
while ( $str =~ /\G$allowed_quote_modifiers/gc ) {
|
||
my $pos = pos($str);
|
||
my $char = substr( $str, $pos - 1, 1 );
|
||
$saw_modifier_e ||= ( $char eq 'e' );
|
||
}
|
||
|
||
# For an 'e' quote modifier we must scan the replacement
|
||
# text for here-doc targets...
|
||
# but if the modifier starts a new line we can skip
|
||
# this because either the here doc will be fully
|
||
# contained in the replacement text (so we can
|
||
# ignore it) or Perl will not find it.
|
||
# See test 'here2.in'.
|
||
if ( $saw_modifier_e && $i_tok >= 0 ) {
|
||
|
||
my $rht = scan_replacement_text($qs1);
|
||
|
||
# Change type from 'Q' to 'h' for quotes with
|
||
# here-doc targets so that the formatter (see sub
|
||
# process_line_of_CODE) will not make any line
|
||
# breaks after this point.
|
||
if ($rht) {
|
||
push @{$rhere_target_list}, @{$rht};
|
||
$type = 'h';
|
||
if ( $i_tok < 0 ) {
|
||
my $ilast = $routput_token_list->[-1];
|
||
$routput_token_type->[$ilast] = $type;
|
||
}
|
||
}
|
||
}
|
||
|
||
if ( defined( pos($str) ) ) {
|
||
|
||
# matched
|
||
if ( pos($str) == length($str) ) {
|
||
last if ( ++$i > $max_token_index );
|
||
}
|
||
|
||
# Looks like a joined quote modifier
|
||
# and keyword, maybe something like
|
||
# s/xxx/yyy/gefor @k=...
|
||
# Example is "galgen.pl". Would have to split
|
||
# the word and insert a new token in the
|
||
# pre-token list. This is so rare that I haven't
|
||
# done it. Will just issue a warning citation.
|
||
|
||
# This error might also be triggered if my quote
|
||
# modifier characters are incomplete
|
||
else {
|
||
warning(<<EOM);
|
||
|
||
Partial match to quote modifier $allowed_quote_modifiers at word: '$str'
|
||
Please put a space between quote modifiers and trailing keywords.
|
||
EOM
|
||
|
||
# print "token $rtokens->[$i]\n";
|
||
# my $num = length($str) - pos($str);
|
||
# $rtokens->[$i]=substr($rtokens->[$i],pos($str),$num);
|
||
# print "continuing with new token $rtokens->[$i]\n";
|
||
|
||
# skipping past this token does least damage
|
||
last if ( ++$i > $max_token_index );
|
||
}
|
||
}
|
||
else {
|
||
|
||
# example file: rokicki4.pl
|
||
# This error might also be triggered if my quote
|
||
# modifier characters are incomplete
|
||
write_logfile_entry(
|
||
"Note: found word $str at quote modifier location\n"
|
||
);
|
||
}
|
||
}
|
||
|
||
# re-initialize
|
||
$allowed_quote_modifiers = "";
|
||
}
|
||
}
|
||
|
||
unless ( $type eq 'b' || $tok eq 'CORE::' ) {
|
||
|
||
# try to catch some common errors
|
||
if ( ( $type eq 'n' ) && ( $tok ne '0' ) ) {
|
||
|
||
if ( $last_nonblank_token eq 'eq' ) {
|
||
complain("Should 'eq' be '==' here ?\n");
|
||
}
|
||
elsif ( $last_nonblank_token eq 'ne' ) {
|
||
complain("Should 'ne' be '!=' here ?\n");
|
||
}
|
||
}
|
||
|
||
$last_last_nonblank_token = $last_nonblank_token;
|
||
$last_last_nonblank_type = $last_nonblank_type;
|
||
$last_last_nonblank_block_type = $last_nonblank_block_type;
|
||
$last_last_nonblank_container_type =
|
||
$last_nonblank_container_type;
|
||
$last_last_nonblank_type_sequence =
|
||
$last_nonblank_type_sequence;
|
||
$last_nonblank_token = $tok;
|
||
$last_nonblank_type = $type;
|
||
$last_nonblank_prototype = $prototype;
|
||
$last_nonblank_block_type = $block_type;
|
||
$last_nonblank_container_type = $container_type;
|
||
$last_nonblank_type_sequence = $type_sequence;
|
||
$last_nonblank_i = $i_tok;
|
||
}
|
||
|
||
# store previous token type
|
||
if ( $i_tok >= 0 ) {
|
||
$routput_token_type->[$i_tok] = $type;
|
||
$routput_block_type->[$i_tok] = $block_type;
|
||
$routput_container_type->[$i_tok] = $container_type;
|
||
$routput_type_sequence->[$i_tok] = $type_sequence;
|
||
$routput_indent_flag->[$i_tok] = $indent_flag;
|
||
}
|
||
my $pre_tok = $rtokens->[$i]; # get the next pre-token
|
||
my $pre_type = $rtoken_type->[$i]; # and type
|
||
$tok = $pre_tok;
|
||
$type = $pre_type; # to be modified as necessary
|
||
$block_type = ""; # blank for all tokens except code block braces
|
||
$container_type = ""; # blank for all tokens except some parens
|
||
$type_sequence = ""; # blank for all tokens except ?/:
|
||
$indent_flag = 0;
|
||
$prototype = ""; # blank for all tokens except user defined subs
|
||
$i_tok = $i;
|
||
|
||
# this pre-token will start an output token
|
||
push( @{$routput_token_list}, $i_tok );
|
||
|
||
# continue gathering identifier if necessary
|
||
# but do not start on blanks and comments
|
||
if ( $id_scan_state && $pre_type ne 'b' && $pre_type ne '#' ) {
|
||
|
||
if ( $is_sub{$id_scan_state} || $is_package{$id_scan_state} ) {
|
||
scan_id();
|
||
}
|
||
else {
|
||
scan_identifier();
|
||
}
|
||
|
||
last if ($id_scan_state);
|
||
next if ( ( $i > 0 ) || $type );
|
||
|
||
# didn't find any token; start over
|
||
$type = $pre_type;
|
||
$tok = $pre_tok;
|
||
}
|
||
|
||
# handle whitespace tokens..
|
||
next if ( $type eq 'b' );
|
||
my $prev_tok = $i > 0 ? $rtokens->[ $i - 1 ] : ' ';
|
||
my $prev_type = $i > 0 ? $rtoken_type->[ $i - 1 ] : 'b';
|
||
|
||
# Build larger tokens where possible, since we are not in a quote.
|
||
#
|
||
# First try to assemble digraphs. The following tokens are
|
||
# excluded and handled specially:
|
||
# '/=' is excluded because the / might start a pattern.
|
||
# 'x=' is excluded since it might be $x=, with $ on previous line
|
||
# '**' and *= might be typeglobs of punctuation variables
|
||
# I have allowed tokens starting with <, such as <=,
|
||
# because I don't think these could be valid angle operators.
|
||
# test file: storrs4.pl
|
||
my $test_tok = $tok . $rtokens->[ $i + 1 ];
|
||
my $combine_ok = $is_digraph{$test_tok};
|
||
|
||
# check for special cases which cannot be combined
|
||
if ($combine_ok) {
|
||
|
||
# '//' must be defined_or operator if an operator is expected.
|
||
# TODO: Code for other ambiguous digraphs (/=, x=, **, *=)
|
||
# could be migrated here for clarity
|
||
|
||
# Patch for RT#102371, misparsing a // in the following snippet:
|
||
# state $b //= ccc();
|
||
# The solution is to always accept the digraph (or trigraph) after
|
||
# token type 'Z' (possible file handle). The reason is that
|
||
# sub operator_expected gives TERM expected here, which is
|
||
# wrong in this case.
|
||
if ( $test_tok eq '//' && $last_nonblank_type ne 'Z' ) {
|
||
my $next_type = $rtokens->[ $i + 1 ];
|
||
my $expecting =
|
||
operator_expected( [ $prev_type, $tok, $next_type ] );
|
||
|
||
# Patched for RT#101547, was 'unless ($expecting==OPERATOR)'
|
||
$combine_ok = 0 if ( $expecting == TERM );
|
||
}
|
||
|
||
# Patch for RT #114359: Missparsing of "print $x ** 0.5;
|
||
# Accept the digraphs '**' only after type 'Z'
|
||
# Otherwise postpone the decision.
|
||
if ( $test_tok eq '**' ) {
|
||
if ( $last_nonblank_type ne 'Z' ) { $combine_ok = 0 }
|
||
}
|
||
}
|
||
|
||
if (
|
||
$combine_ok
|
||
|
||
&& ( $test_tok ne '/=' ) # might be pattern
|
||
&& ( $test_tok ne 'x=' ) # might be $x
|
||
&& ( $test_tok ne '*=' ) # typeglob?
|
||
|
||
# Moved above as part of fix for
|
||
# RT #114359: Missparsing of "print $x ** 0.5;
|
||
# && ( $test_tok ne '**' ) # typeglob?
|
||
)
|
||
{
|
||
$tok = $test_tok;
|
||
$i++;
|
||
|
||
# Now try to assemble trigraphs. Note that all possible
|
||
# perl trigraphs can be constructed by appending a character
|
||
# to a digraph.
|
||
$test_tok = $tok . $rtokens->[ $i + 1 ];
|
||
|
||
if ( $is_trigraph{$test_tok} ) {
|
||
$tok = $test_tok;
|
||
$i++;
|
||
}
|
||
|
||
# The only current tetragraph is the double diamond operator
|
||
# and its first three characters are not a trigraph, so
|
||
# we do can do a special test for it
|
||
elsif ( $test_tok eq '<<>' ) {
|
||
$test_tok .= $rtokens->[ $i + 2 ];
|
||
if ( $is_tetragraph{$test_tok} ) {
|
||
$tok = $test_tok;
|
||
$i += 2;
|
||
}
|
||
}
|
||
}
|
||
|
||
$type = $tok;
|
||
$next_tok = $rtokens->[ $i + 1 ];
|
||
$next_type = $rtoken_type->[ $i + 1 ];
|
||
|
||
DEBUG_TOKENIZE && do {
|
||
local $" = ')(';
|
||
my @debug_list = (
|
||
$last_nonblank_token, $tok,
|
||
$next_tok, $brace_depth,
|
||
$brace_type[$brace_depth], $paren_depth,
|
||
$paren_type[$paren_depth]
|
||
);
|
||
print STDOUT "TOKENIZE:(@debug_list)\n";
|
||
};
|
||
|
||
# turn off attribute list on first non-blank, non-bareword
|
||
if ( $pre_type ne 'w' ) { $in_attribute_list = 0 }
|
||
|
||
###############################################################
|
||
# We have the next token, $tok.
|
||
# Now we have to examine this token and decide what it is
|
||
# and define its $type
|
||
#
|
||
# section 1: bare words
|
||
###############################################################
|
||
|
||
if ( $pre_type eq 'w' ) {
|
||
$expecting =
|
||
operator_expected( [ $prev_type, $tok, $next_type ] );
|
||
my ( $next_nonblank_token, $i_next ) =
|
||
find_next_nonblank_token( $i, $rtokens, $max_token_index );
|
||
|
||
# ATTRS: handle sub and variable attributes
|
||
if ($in_attribute_list) {
|
||
|
||
# treat bare word followed by open paren like qw(
|
||
if ( $next_nonblank_token eq '(' ) {
|
||
|
||
# For something like:
|
||
# : prototype($$)
|
||
# we should let do_scan_sub see it so that it can see
|
||
# the prototype. All other attributes get parsed as a
|
||
# quoted string.
|
||
if ( $tok eq 'prototype' ) {
|
||
$id_scan_state = 'prototype';
|
||
|
||
# start just after the word 'prototype'
|
||
my $i_beg = $i + 1;
|
||
( $i, $tok, $type, $id_scan_state ) = do_scan_sub(
|
||
{
|
||
input_line => $input_line,
|
||
i => $i,
|
||
i_beg => $i_beg,
|
||
tok => $tok,
|
||
type => $type,
|
||
rtokens => $rtokens,
|
||
rtoken_map => $rtoken_map,
|
||
id_scan_state => $id_scan_state,
|
||
max_token_index => $max_token_index
|
||
}
|
||
);
|
||
|
||
# If successful, mark as type 'q' to be consistent with other
|
||
# attributes. Note that type 'w' would also work.
|
||
if ( $i > $i_beg ) {
|
||
$type = 'q';
|
||
next;
|
||
}
|
||
|
||
# If not successful, continue and parse as a quote.
|
||
}
|
||
|
||
# All other attribute lists must be parsed as quotes
|
||
# (see 'signatures.t' for good examples)
|
||
$in_quote = $quote_items{'q'};
|
||
$allowed_quote_modifiers = $quote_modifiers{'q'};
|
||
$type = 'q';
|
||
$quote_type = 'q';
|
||
next;
|
||
}
|
||
|
||
# handle bareword not followed by open paren
|
||
else {
|
||
$type = 'w';
|
||
next;
|
||
}
|
||
}
|
||
|
||
# quote a word followed by => operator
|
||
# unless the word __END__ or __DATA__ and the only word on
|
||
# the line.
|
||
if ( !$is_END_or_DATA && $next_nonblank_token eq '=' ) {
|
||
|
||
if ( $rtokens->[ $i_next + 1 ] eq '>' ) {
|
||
if ( $is_constant{$current_package}{$tok} ) {
|
||
$type = 'C';
|
||
}
|
||
elsif ( $is_user_function{$current_package}{$tok} ) {
|
||
$type = 'U';
|
||
$prototype =
|
||
$user_function_prototype{$current_package}{$tok};
|
||
}
|
||
elsif ( $tok =~ /^v\d+$/ ) {
|
||
$type = 'v';
|
||
report_v_string($tok);
|
||
}
|
||
else {
|
||
|
||
# Bareword followed by a fat comma ... see 'git18.in'
|
||
# If tok is something like 'x17' then it could
|
||
# actually be operator x followed by number 17.
|
||
# For example, here:
|
||
# 123x17 => [ 792, 1224 ],
|
||
# (a key of 123 repeated 17 times, perhaps not
|
||
# what was intended). We will mark x17 as type
|
||
# 'n' and it will be split. If the previous token
|
||
# was also a bareword then it is not very clear is
|
||
# going on. In this case we will not be sure that
|
||
# an operator is expected, so we just mark it as a
|
||
# bareword. Perl is a little murky in what it does
|
||
# with stuff like this, and its behavior can change
|
||
# over time. Something like
|
||
# a x18 => [792, 1224], will compile as
|
||
# a key with 18 a's. But something like
|
||
# push @array, a x18;
|
||
# is a syntax error.
|
||
if ( $expecting == OPERATOR && $tok =~ /^x\d+$/ ) {
|
||
$type = 'n';
|
||
}
|
||
else {
|
||
|
||
# git #18
|
||
$type = 'w';
|
||
error_if_expecting_OPERATOR();
|
||
}
|
||
}
|
||
|
||
next;
|
||
}
|
||
}
|
||
|
||
# quote a bare word within braces..like xxx->{s}; note that we
|
||
# must be sure this is not a structural brace, to avoid
|
||
# mistaking {s} in the following for a quoted bare word:
|
||
# for(@[){s}bla}BLA}
|
||
# Also treat q in something like var{-q} as a bare word, not qoute operator
|
||
if (
|
||
$next_nonblank_token eq '}'
|
||
&& (
|
||
$last_nonblank_type eq 'L'
|
||
|| ( $last_nonblank_type eq 'm'
|
||
&& $last_last_nonblank_type eq 'L' )
|
||
)
|
||
)
|
||
{
|
||
$type = 'w';
|
||
next;
|
||
}
|
||
|
||
# a bare word immediately followed by :: is not a keyword;
|
||
# use $tok_kw when testing for keywords to avoid a mistake
|
||
my $tok_kw = $tok;
|
||
if ( $rtokens->[ $i + 1 ] eq ':'
|
||
&& $rtokens->[ $i + 2 ] eq ':' )
|
||
{
|
||
$tok_kw .= '::';
|
||
}
|
||
|
||
# handle operator x (now we know it isn't $x=)
|
||
if ( $expecting == OPERATOR
|
||
&& substr( $tok, 0, 1 ) eq 'x'
|
||
&& $tok =~ /^x\d*$/ )
|
||
{
|
||
if ( $tok eq 'x' ) {
|
||
|
||
if ( $rtokens->[ $i + 1 ] eq '=' ) { # x=
|
||
$tok = 'x=';
|
||
$type = $tok;
|
||
$i++;
|
||
}
|
||
else {
|
||
$type = 'x';
|
||
}
|
||
}
|
||
|
||
# FIXME: Patch: mark something like x4 as an integer for now
|
||
# It gets fixed downstream. This is easier than
|
||
# splitting the pretoken.
|
||
else {
|
||
$type = 'n';
|
||
}
|
||
}
|
||
elsif ( $tok_kw eq 'CORE::' ) {
|
||
$type = $tok = $tok_kw;
|
||
$i += 2;
|
||
}
|
||
elsif ( ( $tok eq 'strict' )
|
||
and ( $last_nonblank_token eq 'use' ) )
|
||
{
|
||
$tokenizer_self->[_saw_use_strict_] = 1;
|
||
scan_bare_identifier();
|
||
}
|
||
|
||
elsif ( ( $tok eq 'warnings' )
|
||
and ( $last_nonblank_token eq 'use' ) )
|
||
{
|
||
$tokenizer_self->[_saw_perl_dash_w_] = 1;
|
||
|
||
# scan as identifier, so that we pick up something like:
|
||
# use warnings::register
|
||
scan_bare_identifier();
|
||
}
|
||
|
||
elsif (
|
||
$tok eq 'AutoLoader'
|
||
&& $tokenizer_self->[_look_for_autoloader_]
|
||
&& (
|
||
$last_nonblank_token eq 'use'
|
||
|
||
# these regexes are from AutoSplit.pm, which we want
|
||
# to mimic
|
||
|| $input_line =~ /^\s*(use|require)\s+AutoLoader\b/
|
||
|| $input_line =~ /\bISA\s*=.*\bAutoLoader\b/
|
||
)
|
||
)
|
||
{
|
||
write_logfile_entry("AutoLoader seen, -nlal deactivates\n");
|
||
$tokenizer_self->[_saw_autoloader_] = 1;
|
||
$tokenizer_self->[_look_for_autoloader_] = 0;
|
||
scan_bare_identifier();
|
||
}
|
||
|
||
elsif (
|
||
$tok eq 'SelfLoader'
|
||
&& $tokenizer_self->[_look_for_selfloader_]
|
||
&& ( $last_nonblank_token eq 'use'
|
||
|| $input_line =~ /^\s*(use|require)\s+SelfLoader\b/
|
||
|| $input_line =~ /\bISA\s*=.*\bSelfLoader\b/ )
|
||
)
|
||
{
|
||
write_logfile_entry("SelfLoader seen, -nlsl deactivates\n");
|
||
$tokenizer_self->[_saw_selfloader_] = 1;
|
||
$tokenizer_self->[_look_for_selfloader_] = 0;
|
||
scan_bare_identifier();
|
||
}
|
||
|
||
elsif ( ( $tok eq 'constant' )
|
||
and ( $last_nonblank_token eq 'use' ) )
|
||
{
|
||
scan_bare_identifier();
|
||
my ( $next_nonblank_token, $i_next ) =
|
||
find_next_nonblank_token( $i, $rtokens,
|
||
$max_token_index );
|
||
|
||
if ($next_nonblank_token) {
|
||
|
||
if ( $is_keyword{$next_nonblank_token} ) {
|
||
|
||
# Assume qw is used as a quote and okay, as in:
|
||
# use constant qw{ DEBUG 0 };
|
||
# Not worth trying to parse for just a warning
|
||
|
||
# NOTE: This warning is deactivated because recent
|
||
# versions of perl do not complain here, but
|
||
# the coding is retained for reference.
|
||
if ( 0 && $next_nonblank_token ne 'qw' ) {
|
||
warning(
|
||
"Attempting to define constant '$next_nonblank_token' which is a perl keyword\n"
|
||
);
|
||
}
|
||
}
|
||
|
||
# FIXME: could check for error in which next token is
|
||
# not a word (number, punctuation, ..)
|
||
else {
|
||
$is_constant{$current_package}{$next_nonblank_token}
|
||
= 1;
|
||
}
|
||
}
|
||
}
|
||
|
||
# various quote operators
|
||
elsif ( $is_q_qq_qw_qx_qr_s_y_tr_m{$tok} ) {
|
||
##NICOL PATCH
|
||
if ( $expecting == OPERATOR ) {
|
||
|
||
# Be careful not to call an error for a qw quote
|
||
# where a parenthesized list is allowed. For example,
|
||
# it could also be a for/foreach construct such as
|
||
#
|
||
# foreach my $key qw\Uno Due Tres Quadro\ {
|
||
# print "Set $key\n";
|
||
# }
|
||
#
|
||
|
||
# Or it could be a function call.
|
||
# NOTE: Braces in something like &{ xxx } are not
|
||
# marked as a block, we might have a method call.
|
||
# &method(...), $method->(..), &{method}(...),
|
||
# $ref[2](list) is ok & short for $ref[2]->(list)
|
||
#
|
||
# See notes in 'sub code_block_type' and
|
||
# 'sub is_non_structural_brace'
|
||
|
||
unless (
|
||
$tok eq 'qw'
|
||
&& ( $last_nonblank_token =~ /^([\]\}\&]|\-\>)/
|
||
|| $is_for_foreach{$want_paren} )
|
||
)
|
||
{
|
||
error_if_expecting_OPERATOR();
|
||
}
|
||
}
|
||
$in_quote = $quote_items{$tok};
|
||
$allowed_quote_modifiers = $quote_modifiers{$tok};
|
||
|
||
# All quote types are 'Q' except possibly qw quotes.
|
||
# qw quotes are special in that they may generally be trimmed
|
||
# of leading and trailing whitespace. So they are given a
|
||
# separate type, 'q', unless requested otherwise.
|
||
$type =
|
||
( $tok eq 'qw' && $tokenizer_self->[_trim_qw_] )
|
||
? 'q'
|
||
: 'Q';
|
||
$quote_type = $type;
|
||
}
|
||
|
||
# check for a statement label
|
||
elsif (
|
||
( $next_nonblank_token eq ':' )
|
||
&& ( $rtokens->[ $i_next + 1 ] ne ':' )
|
||
&& ( $i_next <= $max_token_index ) # colon on same line
|
||
&& label_ok()
|
||
)
|
||
{
|
||
if ( $tok !~ /[A-Z]/ ) {
|
||
push @{ $tokenizer_self->[_rlower_case_labels_at_] },
|
||
$input_line_number;
|
||
}
|
||
$type = 'J';
|
||
$tok .= ':';
|
||
$i = $i_next;
|
||
next;
|
||
}
|
||
|
||
# 'sub' or alias
|
||
elsif ( $is_sub{$tok_kw} ) {
|
||
error_if_expecting_OPERATOR()
|
||
if ( $expecting == OPERATOR );
|
||
initialize_subname();
|
||
scan_id();
|
||
}
|
||
|
||
# 'package'
|
||
elsif ( $is_package{$tok_kw} ) {
|
||
error_if_expecting_OPERATOR()
|
||
if ( $expecting == OPERATOR );
|
||
scan_id();
|
||
}
|
||
|
||
# Note on token types for format, __DATA__, __END__:
|
||
# It simplifies things to give these type ';', so that when we
|
||
# start rescanning we will be expecting a token of type TERM.
|
||
# We will switch to type 'k' before outputting the tokens.
|
||
elsif ( $is_format_END_DATA{$tok_kw} ) {
|
||
$type = ';'; # make tokenizer look for TERM next
|
||
|
||
# Remember that we are in one of these three sections
|
||
$tokenizer_self->[ $is_format_END_DATA{$tok_kw} ] = 1;
|
||
last;
|
||
}
|
||
|
||
elsif ( $is_keyword{$tok_kw} ) {
|
||
$type = 'k';
|
||
|
||
# Since for and foreach may not be followed immediately
|
||
# by an opening paren, we have to remember which keyword
|
||
# is associated with the next '('
|
||
if ( $is_for_foreach{$tok} ) {
|
||
if ( new_statement_ok() ) {
|
||
$want_paren = $tok;
|
||
}
|
||
}
|
||
|
||
# recognize 'use' statements, which are special
|
||
elsif ( $is_use_require{$tok} ) {
|
||
$statement_type = $tok;
|
||
error_if_expecting_OPERATOR()
|
||
if ( $expecting == OPERATOR );
|
||
}
|
||
|
||
# remember my and our to check for trailing ": shared"
|
||
elsif ( $is_my_our_state{$tok} ) {
|
||
$statement_type = $tok;
|
||
}
|
||
|
||
# Check for misplaced 'elsif' and 'else', but allow isolated
|
||
# else or elsif blocks to be formatted. This is indicated
|
||
# by a last noblank token of ';'
|
||
elsif ( $tok eq 'elsif' ) {
|
||
if ( $last_nonblank_token ne ';'
|
||
&& $last_nonblank_block_type !~
|
||
/^(if|elsif|unless)$/ )
|
||
{
|
||
warning(
|
||
"expecting '$tok' to follow one of 'if|elsif|unless'\n"
|
||
);
|
||
}
|
||
}
|
||
elsif ( $tok eq 'else' ) {
|
||
|
||
# patched for SWITCH/CASE
|
||
if (
|
||
$last_nonblank_token ne ';'
|
||
&& $last_nonblank_block_type !~
|
||
/^(if|elsif|unless|case|when)$/
|
||
|
||
# patch to avoid an unwanted error message for
|
||
# the case of a parenless 'case' (RT 105484):
|
||
# switch ( 1 ) { case x { 2 } else { } }
|
||
&& $statement_type !~
|
||
/^(if|elsif|unless|case|when)$/
|
||
)
|
||
{
|
||
warning(
|
||
"expecting '$tok' to follow one of 'if|elsif|unless|case|when'\n"
|
||
);
|
||
}
|
||
}
|
||
elsif ( $tok eq 'continue' ) {
|
||
if ( $last_nonblank_token ne ';'
|
||
&& $last_nonblank_block_type !~
|
||
/(^(\{|\}|;|while|until|for|foreach)|:$)/ )
|
||
{
|
||
|
||
# note: ';' '{' and '}' in list above
|
||
# because continues can follow bare blocks;
|
||
# ':' is labeled block
|
||
#
|
||
############################################
|
||
# NOTE: This check has been deactivated because
|
||
# continue has an alternative usage for given/when
|
||
# blocks in perl 5.10
|
||
## warning("'$tok' should follow a block\n");
|
||
############################################
|
||
}
|
||
}
|
||
|
||
# patch for SWITCH/CASE if 'case' and 'when are
|
||
# treated as keywords. Also 'default' for Switch::Plain
|
||
elsif ($tok eq 'when'
|
||
|| $tok eq 'case'
|
||
|| $tok eq 'default' )
|
||
{
|
||
$statement_type = $tok; # next '{' is block
|
||
}
|
||
|
||
#
|
||
# indent trailing if/unless/while/until
|
||
# outdenting will be handled by later indentation loop
|
||
## DEACTIVATED: unfortunately this can cause some unwanted indentation like:
|
||
##$opt_o = 1
|
||
## if !(
|
||
## $opt_b
|
||
## || $opt_c
|
||
## || $opt_d
|
||
## || $opt_f
|
||
## || $opt_i
|
||
## || $opt_l
|
||
## || $opt_o
|
||
## || $opt_x
|
||
## );
|
||
## if ( $tok =~ /^(if|unless|while|until)$/
|
||
## && $next_nonblank_token ne '(' )
|
||
## {
|
||
## $indent_flag = 1;
|
||
## }
|
||
}
|
||
|
||
# check for inline label following
|
||
# /^(redo|last|next|goto)$/
|
||
elsif (( $last_nonblank_type eq 'k' )
|
||
&& ( $is_redo_last_next_goto{$last_nonblank_token} ) )
|
||
{
|
||
$type = 'j';
|
||
next;
|
||
}
|
||
|
||
# something else --
|
||
else {
|
||
|
||
scan_bare_identifier();
|
||
|
||
if ( $statement_type eq 'use'
|
||
&& $last_nonblank_token eq 'use' )
|
||
{
|
||
$saw_use_module{$current_package}->{$tok} = 1;
|
||
}
|
||
|
||
if ( $type eq 'w' ) {
|
||
|
||
if ( $expecting == OPERATOR ) {
|
||
|
||
# Patch to avoid error message for RPerl overloaded
|
||
# operator functions: use overload
|
||
# '+' => \&sse_add,
|
||
# '-' => \&sse_sub,
|
||
# '*' => \&sse_mul,
|
||
# '/' => \&sse_div;
|
||
# FIXME: this should eventually be generalized
|
||
if ( $saw_use_module{$current_package}->{'RPerl'}
|
||
&& $tok =~ /^sse_(mul|div|add|sub)$/ )
|
||
{
|
||
|
||
}
|
||
|
||
# don't complain about possible indirect object
|
||
# notation.
|
||
# For example:
|
||
# package main;
|
||
# sub new($) { ... }
|
||
# $b = new A::; # calls A::new
|
||
# $c = new A; # same thing but suspicious
|
||
# This will call A::new but we have a 'new' in
|
||
# main:: which looks like a constant.
|
||
#
|
||
elsif ( $last_nonblank_type eq 'C' ) {
|
||
if ( $tok !~ /::$/ ) {
|
||
complain(<<EOM);
|
||
Expecting operator after '$last_nonblank_token' but found bare word '$tok'
|
||
Maybe indirectet object notation?
|
||
EOM
|
||
}
|
||
}
|
||
else {
|
||
error_if_expecting_OPERATOR("bareword");
|
||
}
|
||
}
|
||
|
||
# mark bare words immediately followed by a paren as
|
||
# functions
|
||
$next_tok = $rtokens->[ $i + 1 ];
|
||
if ( $next_tok eq '(' ) {
|
||
$type = 'U';
|
||
}
|
||
|
||
# underscore after file test operator is file handle
|
||
if ( $tok eq '_' && $last_nonblank_type eq 'F' ) {
|
||
$type = 'Z';
|
||
}
|
||
|
||
# patch for SWITCH/CASE if 'case' and 'when are
|
||
# not treated as keywords:
|
||
if (
|
||
(
|
||
$tok eq 'case'
|
||
&& $brace_type[$brace_depth] eq 'switch'
|
||
)
|
||
|| ( $tok eq 'when'
|
||
&& $brace_type[$brace_depth] eq 'given' )
|
||
)
|
||
{
|
||
$statement_type = $tok; # next '{' is block
|
||
$type = 'k'; # for keyword syntax coloring
|
||
}
|
||
|
||
# patch for SWITCH/CASE if switch and given not keywords
|
||
# Switch is not a perl 5 keyword, but we will gamble
|
||
# and mark switch followed by paren as a keyword. This
|
||
# is only necessary to get html syntax coloring nice,
|
||
# and does not commit this as being a switch/case.
|
||
if ( $next_nonblank_token eq '('
|
||
&& ( $tok eq 'switch' || $tok eq 'given' ) )
|
||
{
|
||
$type = 'k'; # for keyword syntax coloring
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
###############################################################
|
||
# section 2: strings of digits
|
||
###############################################################
|
||
elsif ( $pre_type eq 'd' ) {
|
||
$expecting =
|
||
operator_expected( [ $prev_type, $tok, $next_type ] );
|
||
error_if_expecting_OPERATOR("Number")
|
||
if ( $expecting == OPERATOR );
|
||
|
||
my $number = scan_number_fast();
|
||
if ( !defined($number) ) {
|
||
|
||
# shouldn't happen - we should always get a number
|
||
warning("non-number beginning with digit--program bug\n");
|
||
report_definite_bug();
|
||
}
|
||
}
|
||
|
||
###############################################################
|
||
# section 3: all other tokens
|
||
###############################################################
|
||
|
||
else {
|
||
last if ( $tok eq '#' );
|
||
my $code = $tokenization_code->{$tok};
|
||
if ($code) {
|
||
$expecting =
|
||
operator_expected( [ $prev_type, $tok, $next_type ] );
|
||
$code->();
|
||
redo if $in_quote;
|
||
}
|
||
}
|
||
}
|
||
|
||
# -----------------------------
|
||
# end of main tokenization loop
|
||
# -----------------------------
|
||
|
||
if ( $i_tok >= 0 ) {
|
||
$routput_token_type->[$i_tok] = $type;
|
||
$routput_block_type->[$i_tok] = $block_type;
|
||
$routput_container_type->[$i_tok] = $container_type;
|
||
$routput_type_sequence->[$i_tok] = $type_sequence;
|
||
$routput_indent_flag->[$i_tok] = $indent_flag;
|
||
}
|
||
|
||
unless ( ( $type eq 'b' ) || ( $type eq '#' ) ) {
|
||
$last_last_nonblank_token = $last_nonblank_token;
|
||
$last_last_nonblank_type = $last_nonblank_type;
|
||
$last_last_nonblank_block_type = $last_nonblank_block_type;
|
||
$last_last_nonblank_container_type = $last_nonblank_container_type;
|
||
$last_last_nonblank_type_sequence = $last_nonblank_type_sequence;
|
||
$last_nonblank_token = $tok;
|
||
$last_nonblank_type = $type;
|
||
$last_nonblank_block_type = $block_type;
|
||
$last_nonblank_container_type = $container_type;
|
||
$last_nonblank_type_sequence = $type_sequence;
|
||
$last_nonblank_prototype = $prototype;
|
||
}
|
||
|
||
# reset indentation level if necessary at a sub or package
|
||
# in an attempt to recover from a nesting error
|
||
if ( $level_in_tokenizer < 0 ) {
|
||
if ( $input_line =~ /^\s*(sub|package)\s+(\w+)/ ) {
|
||
reset_indentation_level(0);
|
||
brace_warning("resetting level to 0 at $1 $2\n");
|
||
}
|
||
}
|
||
|
||
# all done tokenizing this line ...
|
||
# now prepare the final list of tokens and types
|
||
|
||
my @token_type = (); # stack of output token types
|
||
my @block_type = (); # stack of output code block types
|
||
my @container_type = (); # stack of output code container types
|
||
my @type_sequence = (); # stack of output type sequence numbers
|
||
my @tokens = (); # output tokens
|
||
my @levels = (); # structural brace levels of output tokens
|
||
my @slevels = (); # secondary nesting levels of output tokens
|
||
my @nesting_tokens = (); # string of tokens leading to this depth
|
||
my @nesting_types = (); # string of token types leading to this depth
|
||
my @nesting_blocks = (); # string of block types leading to this depth
|
||
my @nesting_lists = (); # string of list types leading to this depth
|
||
my @ci_string = (); # string needed to compute continuation indentation
|
||
my @container_environment = (); # BLOCK or LIST
|
||
my $container_environment = '';
|
||
my $im = -1; # previous $i value
|
||
my $num;
|
||
|
||
# Count the number of '1's in the string (previously sub ones_count)
|
||
my $ci_string_sum = ( my $str = $ci_string_in_tokenizer ) =~ tr/1/0/;
|
||
|
||
# Computing Token Indentation
|
||
#
|
||
# The final section of the tokenizer forms tokens and also computes
|
||
# parameters needed to find indentation. It is much easier to do it
|
||
# in the tokenizer than elsewhere. Here is a brief description of how
|
||
# indentation is computed. Perl::Tidy computes indentation as the sum
|
||
# of 2 terms:
|
||
#
|
||
# (1) structural indentation, such as if/else/elsif blocks
|
||
# (2) continuation indentation, such as long parameter call lists.
|
||
#
|
||
# These are occasionally called primary and secondary indentation.
|
||
#
|
||
# Structural indentation is introduced by tokens of type '{', although
|
||
# the actual tokens might be '{', '(', or '['. Structural indentation
|
||
# is of two types: BLOCK and non-BLOCK. Default structural indentation
|
||
# is 4 characters if the standard indentation scheme is used.
|
||
#
|
||
# Continuation indentation is introduced whenever a line at BLOCK level
|
||
# is broken before its termination. Default continuation indentation
|
||
# is 2 characters in the standard indentation scheme.
|
||
#
|
||
# Both types of indentation may be nested arbitrarily deep and
|
||
# interlaced. The distinction between the two is somewhat arbitrary.
|
||
#
|
||
# For each token, we will define two variables which would apply if
|
||
# the current statement were broken just before that token, so that
|
||
# that token started a new line:
|
||
#
|
||
# $level = the structural indentation level,
|
||
# $ci_level = the continuation indentation level
|
||
#
|
||
# The total indentation will be $level * (4 spaces) + $ci_level * (2 spaces),
|
||
# assuming defaults. However, in some special cases it is customary
|
||
# to modify $ci_level from this strict value.
|
||
#
|
||
# The total structural indentation is easy to compute by adding and
|
||
# subtracting 1 from a saved value as types '{' and '}' are seen. The
|
||
# running value of this variable is $level_in_tokenizer.
|
||
#
|
||
# The total continuation is much more difficult to compute, and requires
|
||
# several variables. These variables are:
|
||
#
|
||
# $ci_string_in_tokenizer = a string of 1's and 0's indicating, for
|
||
# each indentation level, if there are intervening open secondary
|
||
# structures just prior to that level.
|
||
# $continuation_string_in_tokenizer = a string of 1's and 0's indicating
|
||
# if the last token at that level is "continued", meaning that it
|
||
# is not the first token of an expression.
|
||
# $nesting_block_string = a string of 1's and 0's indicating, for each
|
||
# indentation level, if the level is of type BLOCK or not.
|
||
# $nesting_block_flag = the most recent 1 or 0 of $nesting_block_string
|
||
# $nesting_list_string = a string of 1's and 0's indicating, for each
|
||
# indentation level, if it is appropriate for list formatting.
|
||
# If so, continuation indentation is used to indent long list items.
|
||
# $nesting_list_flag = the most recent 1 or 0 of $nesting_list_string
|
||
# @{$rslevel_stack} = a stack of total nesting depths at each
|
||
# structural indentation level, where "total nesting depth" means
|
||
# the nesting depth that would occur if every nesting token -- '{', '[',
|
||
# and '(' -- , regardless of context, is used to compute a nesting
|
||
# depth.
|
||
|
||
#my $nesting_block_flag = ($nesting_block_string =~ /1$/);
|
||
#my $nesting_list_flag = ($nesting_list_string =~ /1$/);
|
||
|
||
my ( $ci_string_i, $level_i, $nesting_block_string_i,
|
||
$nesting_list_string_i, $nesting_token_string_i,
|
||
$nesting_type_string_i, );
|
||
|
||
foreach my $i ( @{$routput_token_list} )
|
||
{ # scan the list of pre-tokens indexes
|
||
|
||
# self-checking for valid token types
|
||
my $type = $routput_token_type->[$i];
|
||
my $forced_indentation_flag = $routput_indent_flag->[$i];
|
||
|
||
# See if we should undo the $forced_indentation_flag.
|
||
# Forced indentation after 'if', 'unless', 'while' and 'until'
|
||
# expressions without trailing parens is optional and doesn't
|
||
# always look good. It is usually okay for a trailing logical
|
||
# expression, but if the expression is a function call, code block,
|
||
# or some kind of list it puts in an unwanted extra indentation
|
||
# level which is hard to remove.
|
||
#
|
||
# Example where extra indentation looks ok:
|
||
# return 1
|
||
# if $det_a < 0 and $det_b > 0
|
||
# or $det_a > 0 and $det_b < 0;
|
||
#
|
||
# Example where extra indentation is not needed because
|
||
# the eval brace also provides indentation:
|
||
# print "not " if defined eval {
|
||
# reduce { die if $b > 2; $a + $b } 0, 1, 2, 3, 4;
|
||
# };
|
||
#
|
||
# The following rule works fairly well:
|
||
# Undo the flag if the end of this line, or start of the next
|
||
# line, is an opening container token or a comma.
|
||
# This almost always works, but if not after another pass it will
|
||
# be stable.
|
||
if ( $forced_indentation_flag && $type eq 'k' ) {
|
||
my $ixlast = -1;
|
||
my $ilast = $routput_token_list->[$ixlast];
|
||
my $toklast = $routput_token_type->[$ilast];
|
||
if ( $toklast eq '#' ) {
|
||
$ixlast--;
|
||
$ilast = $routput_token_list->[$ixlast];
|
||
$toklast = $routput_token_type->[$ilast];
|
||
}
|
||
if ( $toklast eq 'b' ) {
|
||
$ixlast--;
|
||
$ilast = $routput_token_list->[$ixlast];
|
||
$toklast = $routput_token_type->[$ilast];
|
||
}
|
||
if ( $toklast =~ /^[\{,]$/ ) {
|
||
$forced_indentation_flag = 0;
|
||
}
|
||
else {
|
||
( $toklast, my $i_next ) =
|
||
find_next_nonblank_token( $max_token_index, $rtokens,
|
||
$max_token_index );
|
||
if ( $toklast =~ /^[\{,]$/ ) {
|
||
$forced_indentation_flag = 0;
|
||
}
|
||
}
|
||
}
|
||
|
||
# if we are already in an indented if, see if we should outdent
|
||
if ($indented_if_level) {
|
||
|
||
# don't try to nest trailing if's - shouldn't happen
|
||
if ( $type eq 'k' ) {
|
||
$forced_indentation_flag = 0;
|
||
}
|
||
|
||
# check for the normal case - outdenting at next ';'
|
||
elsif ( $type eq ';' ) {
|
||
if ( $level_in_tokenizer == $indented_if_level ) {
|
||
$forced_indentation_flag = -1;
|
||
$indented_if_level = 0;
|
||
}
|
||
}
|
||
|
||
# handle case of missing semicolon
|
||
elsif ( $type eq '}' ) {
|
||
if ( $level_in_tokenizer == $indented_if_level ) {
|
||
$indented_if_level = 0;
|
||
|
||
# TBD: This could be a subroutine call
|
||
$level_in_tokenizer--;
|
||
if ( @{$rslevel_stack} > 1 ) {
|
||
pop( @{$rslevel_stack} );
|
||
}
|
||
if ( length($nesting_block_string) > 1 )
|
||
{ # true for valid script
|
||
chop $nesting_block_string;
|
||
chop $nesting_list_string;
|
||
}
|
||
|
||
}
|
||
}
|
||
}
|
||
|
||
my $tok = $rtokens->[$i]; # the token, but ONLY if same as pretoken
|
||
$level_i = $level_in_tokenizer;
|
||
|
||
# This can happen by running perltidy on non-scripts
|
||
# although it could also be bug introduced by programming change.
|
||
# Perl silently accepts a 032 (^Z) and takes it as the end
|
||
if ( !$is_valid_token_type{$type} ) {
|
||
my $val = ord($type);
|
||
warning(
|
||
"unexpected character decimal $val ($type) in script\n");
|
||
$tokenizer_self->[_in_error_] = 1;
|
||
}
|
||
|
||
# ----------------------------------------------------------------
|
||
# TOKEN TYPE PATCHES
|
||
# output __END__, __DATA__, and format as type 'k' instead of ';'
|
||
# to make html colors correct, etc.
|
||
my $fix_type = $type;
|
||
if ( $type eq ';' && $tok =~ /\w/ ) { $fix_type = 'k' }
|
||
|
||
# output anonymous 'sub' as keyword
|
||
if ( $type eq 't' && $is_sub{$tok} ) { $fix_type = 'k' }
|
||
|
||
# -----------------------------------------------------------------
|
||
|
||
$nesting_token_string_i = $nesting_token_string;
|
||
$nesting_type_string_i = $nesting_type_string;
|
||
$nesting_block_string_i = $nesting_block_string;
|
||
$nesting_list_string_i = $nesting_list_string;
|
||
|
||
# set primary indentation levels based on structural braces
|
||
# Note: these are set so that the leading braces have a HIGHER
|
||
# level than their CONTENTS, which is convenient for indentation
|
||
# Also, define continuation indentation for each token.
|
||
if ( $type eq '{' || $type eq 'L' || $forced_indentation_flag > 0 )
|
||
{
|
||
|
||
# use environment before updating
|
||
$container_environment =
|
||
$nesting_block_flag ? 'BLOCK'
|
||
: $nesting_list_flag ? 'LIST'
|
||
: "";
|
||
|
||
# if the difference between total nesting levels is not 1,
|
||
# there are intervening non-structural nesting types between
|
||
# this '{' and the previous unclosed '{'
|
||
my $intervening_secondary_structure = 0;
|
||
if ( @{$rslevel_stack} ) {
|
||
$intervening_secondary_structure =
|
||
$slevel_in_tokenizer - $rslevel_stack->[-1];
|
||
}
|
||
|
||
# Continuation Indentation
|
||
#
|
||
# Having tried setting continuation indentation both in the formatter and
|
||
# in the tokenizer, I can say that setting it in the tokenizer is much,
|
||
# much easier. The formatter already has too much to do, and can't
|
||
# make decisions on line breaks without knowing what 'ci' will be at
|
||
# arbitrary locations.
|
||
#
|
||
# But a problem with setting the continuation indentation (ci) here
|
||
# in the tokenizer is that we do not know where line breaks will actually
|
||
# be. As a result, we don't know if we should propagate continuation
|
||
# indentation to higher levels of structure.
|
||
#
|
||
# For nesting of only structural indentation, we never need to do this.
|
||
# For example, in a long if statement, like this
|
||
#
|
||
# if ( !$output_block_type[$i]
|
||
# && ($in_statement_continuation) )
|
||
# { <--outdented
|
||
# do_something();
|
||
# }
|
||
#
|
||
# the second line has ci but we do normally give the lines within the BLOCK
|
||
# any ci. This would be true if we had blocks nested arbitrarily deeply.
|
||
#
|
||
# But consider something like this, where we have created a break after
|
||
# an opening paren on line 1, and the paren is not (currently) a
|
||
# structural indentation token:
|
||
#
|
||
# my $file = $menubar->Menubutton(
|
||
# qw/-text File -underline 0 -menuitems/ => [
|
||
# [
|
||
# Cascade => '~View',
|
||
# -menuitems => [
|
||
# ...
|
||
#
|
||
# The second line has ci, so it would seem reasonable to propagate it
|
||
# down, giving the third line 1 ci + 1 indentation. This suggests the
|
||
# following rule, which is currently used to propagating ci down: if there
|
||
# are any non-structural opening parens (or brackets, or braces), before
|
||
# an opening structural brace, then ci is propagated down, and otherwise
|
||
# not. The variable $intervening_secondary_structure contains this
|
||
# information for the current token, and the string
|
||
# "$ci_string_in_tokenizer" is a stack of previous values of this
|
||
# variable.
|
||
|
||
# save the current states
|
||
push( @{$rslevel_stack}, 1 + $slevel_in_tokenizer );
|
||
$level_in_tokenizer++;
|
||
|
||
if ( $level_in_tokenizer > $tokenizer_self->[_maximum_level_] )
|
||
{
|
||
$tokenizer_self->[_maximum_level_] = $level_in_tokenizer;
|
||
}
|
||
|
||
if ($forced_indentation_flag) {
|
||
|
||
# break BEFORE '?' when there is forced indentation
|
||
if ( $type eq '?' ) { $level_i = $level_in_tokenizer; }
|
||
if ( $type eq 'k' ) {
|
||
$indented_if_level = $level_in_tokenizer;
|
||
}
|
||
|
||
# do not change container environment here if we are not
|
||
# at a real list. Adding this check prevents "blinkers"
|
||
# often near 'unless" clauses, such as in the following
|
||
# code:
|
||
## next
|
||
## unless -e (
|
||
## $archive =
|
||
## File::Spec->catdir( $_, "auto", $root, "$sub$lib_ext" )
|
||
## );
|
||
|
||
$nesting_block_string .= "$nesting_block_flag";
|
||
}
|
||
else {
|
||
|
||
if ( $routput_block_type->[$i] ) {
|
||
$nesting_block_flag = 1;
|
||
$nesting_block_string .= '1';
|
||
}
|
||
else {
|
||
$nesting_block_flag = 0;
|
||
$nesting_block_string .= '0';
|
||
}
|
||
}
|
||
|
||
# we will use continuation indentation within containers
|
||
# which are not blocks and not logical expressions
|
||
my $bit = 0;
|
||
if ( !$routput_block_type->[$i] ) {
|
||
|
||
# propagate flag down at nested open parens
|
||
if ( $routput_container_type->[$i] eq '(' ) {
|
||
$bit = 1 if $nesting_list_flag;
|
||
}
|
||
|
||
# use list continuation if not a logical grouping
|
||
# /^(if|elsif|unless|while|and|or|not|&&|!|\|\||for|foreach)$/
|
||
else {
|
||
$bit = 1
|
||
unless
|
||
$is_logical_container{ $routput_container_type->[$i]
|
||
};
|
||
}
|
||
}
|
||
$nesting_list_string .= $bit;
|
||
$nesting_list_flag = $bit;
|
||
|
||
$ci_string_in_tokenizer .=
|
||
( $intervening_secondary_structure != 0 ) ? '1' : '0';
|
||
$ci_string_sum =
|
||
( my $str = $ci_string_in_tokenizer ) =~ tr/1/0/;
|
||
$continuation_string_in_tokenizer .=
|
||
( $in_statement_continuation > 0 ) ? '1' : '0';
|
||
|
||
# Sometimes we want to give an opening brace continuation indentation,
|
||
# and sometimes not. For code blocks, we don't do it, so that the leading
|
||
# '{' gets outdented, like this:
|
||
#
|
||
# if ( !$output_block_type[$i]
|
||
# && ($in_statement_continuation) )
|
||
# { <--outdented
|
||
#
|
||
# For other types, we will give them continuation indentation. For example,
|
||
# here is how a list looks with the opening paren indented:
|
||
#
|
||
# @LoL =
|
||
# ( [ "fred", "barney" ], [ "george", "jane", "elroy" ],
|
||
# [ "homer", "marge", "bart" ], );
|
||
#
|
||
# This looks best when 'ci' is one-half of the indentation (i.e., 2 and 4)
|
||
|
||
my $total_ci = $ci_string_sum;
|
||
if (
|
||
!$routput_block_type->[$i] # patch: skip for BLOCK
|
||
&& ($in_statement_continuation)
|
||
&& !( $forced_indentation_flag && $type eq ':' )
|
||
)
|
||
{
|
||
$total_ci += $in_statement_continuation
|
||
unless ( substr( $ci_string_in_tokenizer, -1 ) eq '1' );
|
||
}
|
||
|
||
$ci_string_i = $total_ci;
|
||
$in_statement_continuation = 0;
|
||
}
|
||
|
||
elsif ($type eq '}'
|
||
|| $type eq 'R'
|
||
|| $forced_indentation_flag < 0 )
|
||
{
|
||
|
||
# only a nesting error in the script would prevent popping here
|
||
if ( @{$rslevel_stack} > 1 ) { pop( @{$rslevel_stack} ); }
|
||
|
||
$level_i = --$level_in_tokenizer;
|
||
|
||
# restore previous level values
|
||
if ( length($nesting_block_string) > 1 )
|
||
{ # true for valid script
|
||
chop $nesting_block_string;
|
||
$nesting_block_flag =
|
||
substr( $nesting_block_string, -1 ) eq '1';
|
||
chop $nesting_list_string;
|
||
$nesting_list_flag =
|
||
substr( $nesting_list_string, -1 ) eq '1';
|
||
|
||
chop $ci_string_in_tokenizer;
|
||
$ci_string_sum =
|
||
( my $str = $ci_string_in_tokenizer ) =~ tr/1/0/;
|
||
|
||
$in_statement_continuation =
|
||
chop $continuation_string_in_tokenizer;
|
||
|
||
# zero continuation flag at terminal BLOCK '}' which
|
||
# ends a statement.
|
||
if ( $routput_block_type->[$i] ) {
|
||
|
||
# ...These include non-anonymous subs
|
||
# note: could be sub ::abc { or sub 'abc
|
||
if ( $routput_block_type->[$i] =~ m/^sub\s*/gc ) {
|
||
|
||
# note: older versions of perl require the /gc modifier
|
||
# here or else the \G does not work.
|
||
if ( $routput_block_type->[$i] =~ /\G('|::|\w)/gc )
|
||
{
|
||
$in_statement_continuation = 0;
|
||
}
|
||
}
|
||
|
||
# ...and include all block types except user subs with
|
||
# block prototypes and these: (sort|grep|map|do|eval)
|
||
# /^(\}|\{|BEGIN|END|CHECK|INIT|AUTOLOAD|DESTROY|UNITCHECK|continue|;|if|elsif|else|unless|while|until|for|foreach)$/
|
||
elsif (
|
||
$is_zero_continuation_block_type{
|
||
$routput_block_type->[$i] } )
|
||
{
|
||
$in_statement_continuation = 0;
|
||
}
|
||
|
||
# ..but these are not terminal types:
|
||
# /^(sort|grep|map|do|eval)$/ )
|
||
elsif (
|
||
$is_not_zero_continuation_block_type{
|
||
$routput_block_type->[$i] } )
|
||
{
|
||
}
|
||
|
||
# ..and a block introduced by a label
|
||
# /^\w+\s*:$/gc ) {
|
||
elsif ( $routput_block_type->[$i] =~ /:$/ ) {
|
||
$in_statement_continuation = 0;
|
||
}
|
||
|
||
# user function with block prototype
|
||
else {
|
||
$in_statement_continuation = 0;
|
||
}
|
||
}
|
||
|
||
# If we are in a list, then
|
||
# we must set continuation indentation at the closing
|
||
# paren of something like this (paren after $check):
|
||
# assert(
|
||
# __LINE__,
|
||
# ( not defined $check )
|
||
# or ref $check
|
||
# or $check eq "new"
|
||
# or $check eq "old",
|
||
# );
|
||
elsif ( $tok eq ')' ) {
|
||
$in_statement_continuation = 1
|
||
if $routput_container_type->[$i] =~ /^[;,\{\}]$/;
|
||
}
|
||
|
||
elsif ( $tok eq ';' ) { $in_statement_continuation = 0 }
|
||
}
|
||
|
||
# use environment after updating
|
||
$container_environment =
|
||
$nesting_block_flag ? 'BLOCK'
|
||
: $nesting_list_flag ? 'LIST'
|
||
: "";
|
||
$ci_string_i = $ci_string_sum + $in_statement_continuation;
|
||
$nesting_block_string_i = $nesting_block_string;
|
||
$nesting_list_string_i = $nesting_list_string;
|
||
}
|
||
|
||
# not a structural indentation type..
|
||
else {
|
||
|
||
$container_environment =
|
||
$nesting_block_flag ? 'BLOCK'
|
||
: $nesting_list_flag ? 'LIST'
|
||
: "";
|
||
|
||
# zero the continuation indentation at certain tokens so
|
||
# that they will be at the same level as its container. For
|
||
# commas, this simplifies the -lp indentation logic, which
|
||
# counts commas. For ?: it makes them stand out.
|
||
if ($nesting_list_flag) {
|
||
## $type =~ /^[,\?\:]$/
|
||
if ( $is_comma_question_colon{$type} ) {
|
||
$in_statement_continuation = 0;
|
||
}
|
||
}
|
||
|
||
# be sure binary operators get continuation indentation
|
||
if (
|
||
$container_environment
|
||
&& ( $type eq 'k' && $is_binary_keyword{$tok}
|
||
|| $is_binary_type{$type} )
|
||
)
|
||
{
|
||
$in_statement_continuation = 1;
|
||
}
|
||
|
||
# continuation indentation is sum of any open ci from previous
|
||
# levels plus the current level
|
||
$ci_string_i = $ci_string_sum + $in_statement_continuation;
|
||
|
||
# update continuation flag ...
|
||
# if this isn't a blank or comment..
|
||
if ( $type ne 'b' && $type ne '#' ) {
|
||
|
||
# and we are in a BLOCK
|
||
if ($nesting_block_flag) {
|
||
|
||
# the next token after a ';' and label starts a new stmt
|
||
if ( $type eq ';' || $type eq 'J' ) {
|
||
$in_statement_continuation = 0;
|
||
}
|
||
|
||
# otherwise, we are continuing the current statement
|
||
else {
|
||
$in_statement_continuation = 1;
|
||
}
|
||
}
|
||
|
||
# if we are not in a BLOCK..
|
||
else {
|
||
|
||
# do not use continuation indentation if not list
|
||
# environment (could be within if/elsif clause)
|
||
if ( !$nesting_list_flag ) {
|
||
$in_statement_continuation = 0;
|
||
}
|
||
|
||
# otherwise, the token after a ',' starts a new term
|
||
|
||
# Patch FOR RT#99961; no continuation after a ';'
|
||
# This is needed because perltidy currently marks
|
||
# a block preceded by a type character like % or @
|
||
# as a non block, to simplify formatting. But these
|
||
# are actually blocks and can have semicolons.
|
||
# See code_block_type() and is_non_structural_brace().
|
||
elsif ( $type eq ',' || $type eq ';' ) {
|
||
$in_statement_continuation = 0;
|
||
}
|
||
|
||
# otherwise, we are continuing the current term
|
||
else {
|
||
$in_statement_continuation = 1;
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
if ( $level_in_tokenizer < 0 ) {
|
||
unless ( $tokenizer_self->[_saw_negative_indentation_] ) {
|
||
$tokenizer_self->[_saw_negative_indentation_] = 1;
|
||
warning("Starting negative indentation\n");
|
||
}
|
||
}
|
||
|
||
# set secondary nesting levels based on all containment token types
|
||
# Note: these are set so that the nesting depth is the depth
|
||
# of the PREVIOUS TOKEN, which is convenient for setting
|
||
# the strength of token bonds
|
||
my $slevel_i = $slevel_in_tokenizer;
|
||
|
||
# /^[L\{\(\[]$/
|
||
if ( $is_opening_type{$type} ) {
|
||
$slevel_in_tokenizer++;
|
||
$nesting_token_string .= $tok;
|
||
$nesting_type_string .= $type;
|
||
}
|
||
|
||
# /^[R\}\)\]]$/
|
||
elsif ( $is_closing_type{$type} ) {
|
||
$slevel_in_tokenizer--;
|
||
my $char = chop $nesting_token_string;
|
||
|
||
if ( $char ne $matching_start_token{$tok} ) {
|
||
$nesting_token_string .= $char . $tok;
|
||
$nesting_type_string .= $type;
|
||
}
|
||
else {
|
||
chop $nesting_type_string;
|
||
}
|
||
}
|
||
|
||
push( @block_type, $routput_block_type->[$i] );
|
||
push( @ci_string, $ci_string_i );
|
||
push( @container_environment, $container_environment );
|
||
push( @container_type, $routput_container_type->[$i] );
|
||
push( @levels, $level_i );
|
||
push( @nesting_tokens, $nesting_token_string_i );
|
||
push( @nesting_types, $nesting_type_string_i );
|
||
push( @slevels, $slevel_i );
|
||
push( @token_type, $fix_type );
|
||
push( @type_sequence, $routput_type_sequence->[$i] );
|
||
push( @nesting_blocks, $nesting_block_string );
|
||
push( @nesting_lists, $nesting_list_string );
|
||
|
||
# now form the previous token
|
||
if ( $im >= 0 ) {
|
||
$num =
|
||
$rtoken_map->[$i] - $rtoken_map->[$im]; # how many characters
|
||
|
||
if ( $num > 0 ) {
|
||
push( @tokens,
|
||
substr( $input_line, $rtoken_map->[$im], $num ) );
|
||
}
|
||
}
|
||
$im = $i;
|
||
}
|
||
|
||
$num = length($input_line) - $rtoken_map->[$im]; # make the last token
|
||
if ( $num > 0 ) {
|
||
push( @tokens, substr( $input_line, $rtoken_map->[$im], $num ) );
|
||
}
|
||
|
||
$tokenizer_self->[_in_attribute_list_] = $in_attribute_list;
|
||
$tokenizer_self->[_in_quote_] = $in_quote;
|
||
$tokenizer_self->[_quote_target_] =
|
||
$in_quote ? matching_end_token($quote_character) : "";
|
||
$tokenizer_self->[_rhere_target_list_] = $rhere_target_list;
|
||
|
||
$line_of_tokens->{_rtoken_type} = \@token_type;
|
||
$line_of_tokens->{_rtokens} = \@tokens;
|
||
$line_of_tokens->{_rblock_type} = \@block_type;
|
||
$line_of_tokens->{_rcontainer_type} = \@container_type;
|
||
$line_of_tokens->{_rcontainer_environment} = \@container_environment;
|
||
$line_of_tokens->{_rtype_sequence} = \@type_sequence;
|
||
$line_of_tokens->{_rlevels} = \@levels;
|
||
$line_of_tokens->{_rslevels} = \@slevels;
|
||
$line_of_tokens->{_rnesting_tokens} = \@nesting_tokens;
|
||
$line_of_tokens->{_rci_levels} = \@ci_string;
|
||
$line_of_tokens->{_rnesting_blocks} = \@nesting_blocks;
|
||
|
||
return;
|
||
}
|
||
} # end tokenize_this_line
|
||
|
||
#########i#############################################################
|
||
# Tokenizer routines which assist in identifying token types
|
||
#######################################################################
|
||
|
||
# hash lookup table of operator expected values
|
||
my %op_expected_table;
|
||
|
||
BEGIN {
|
||
|
||
# Always expecting TERM following these types:
|
||
# note: this is identical to '@value_requestor_type' defined later.
|
||
my @q = qw(
|
||
; ! + x & ? F J - p / Y : % f U ~ A G j L * . | ^ < = [ m { \ > t
|
||
|| >= != mm *= => .. !~ == && |= .= pp -= =~ += <= %= ^= x= ~~ ** << /=
|
||
&= // >> ~. &. |. ^.
|
||
... **= <<= >>= &&= ||= //= <=> !~~ &.= |.= ^.= <<~
|
||
);
|
||
push @q, ',';
|
||
push @q, '('; # for completeness, not currently a token type
|
||
@{op_expected_table}{@q} = (TERM) x scalar(@q);
|
||
|
||
# Always UNKNOWN following these types:
|
||
@q = qw( w );
|
||
@{op_expected_table}{@q} = (UNKNOWN) x scalar(@q);
|
||
|
||
# Always expecting OPERATOR ...
|
||
# 'n' and 'v' are currently excluded because they might be VERSION numbers
|
||
# 'i' is currently excluded because it might be a package
|
||
# 'q' is currently excluded because it might be a prototype
|
||
@q = qw( -- C -> h R ++ ] Q <> ); ## n v q i );
|
||
push @q, ')';
|
||
@{op_expected_table}{@q} = (OPERATOR) x scalar(@q);
|
||
|
||
}
|
||
|
||
sub operator_expected {
|
||
|
||
# Returns a parameter indicating what types of tokens can occur next
|
||
|
||
# Call format:
|
||
# $op_expected = operator_expected( [ $prev_type, $tok, $next_type ] );
|
||
# where
|
||
# $prev_type is the type of the previous token (blank or not)
|
||
# $tok is the current token
|
||
# $next_type is the type of the next token (blank or not)
|
||
|
||
# Many perl symbols have two or more meanings. For example, '<<'
|
||
# can be a shift operator or a here-doc operator. The
|
||
# interpretation of these symbols depends on the current state of
|
||
# the tokenizer, which may either be expecting a term or an
|
||
# operator. For this example, a << would be a shift if an OPERATOR
|
||
# is expected, and a here-doc if a TERM is expected. This routine
|
||
# is called to make this decision for any current token. It returns
|
||
# one of three possible values:
|
||
#
|
||
# OPERATOR - operator expected (or at least, not a term)
|
||
# UNKNOWN - can't tell
|
||
# TERM - a term is expected (or at least, not an operator)
|
||
#
|
||
# The decision is based on what has been seen so far. This
|
||
# information is stored in the "$last_nonblank_type" and
|
||
# "$last_nonblank_token" variables. For example, if the
|
||
# $last_nonblank_type is '=~', then we are expecting a TERM, whereas
|
||
# if $last_nonblank_type is 'n' (numeric), we are expecting an
|
||
# OPERATOR.
|
||
#
|
||
# If a UNKNOWN is returned, the calling routine must guess. A major
|
||
# goal of this tokenizer is to minimize the possibility of returning
|
||
# UNKNOWN, because a wrong guess can spoil the formatting of a
|
||
# script.
|
||
#
|
||
# Adding NEW_TOKENS: it is critically important that this routine be
|
||
# updated to allow it to determine if an operator or term is to be
|
||
# expected after the new token. Doing this simply involves adding
|
||
# the new token character to one of the regexes in this routine or
|
||
# to one of the hash lists
|
||
# that it uses, which are initialized in the BEGIN section.
|
||
# USES GLOBAL VARIABLES: $last_nonblank_type, $last_nonblank_token,
|
||
# $statement_type
|
||
|
||
# When possible, token types should be selected such that we can determine
|
||
# the 'operator_expected' value by a simple hash lookup. If there are
|
||
# exceptions, that is an indication that a new type is needed.
|
||
|
||
my ($rarg) = @_;
|
||
|
||
##############
|
||
# Table lookup
|
||
##############
|
||
|
||
# Many types are can be obtained by a table lookup given the previous type.
|
||
# This typically handles half or more of the calls.
|
||
my $op_expected = $op_expected_table{$last_nonblank_type};
|
||
goto RETURN if ( defined($op_expected) );
|
||
|
||
######################
|
||
# Handle special cases
|
||
######################
|
||
|
||
$op_expected = UNKNOWN;
|
||
my ( $prev_type, $tok, $next_type ) = @{$rarg};
|
||
|
||
# Types 'k', '}' and 'Z' depend on context
|
||
# FIXME: Types 'i', 'n', 'v', 'q' currently also temporarily depend on
|
||
# context but that dependence could eventually be eliminated with better
|
||
# token type definition
|
||
|
||
# identifier...
|
||
if ( $last_nonblank_type eq 'i' ) {
|
||
$op_expected = OPERATOR;
|
||
|
||
# FIXME: it would be cleaner to make this a special type
|
||
# expecting VERSION or {} after package NAMESPACE
|
||
# TODO: maybe mark these words as type 'Y'?
|
||
if ( $statement_type =~ /^package\b/
|
||
&& $last_nonblank_token =~ /^package\b/ )
|
||
{
|
||
$op_expected = TERM;
|
||
}
|
||
}
|
||
|
||
# keyword...
|
||
elsif ( $last_nonblank_type eq 'k' ) {
|
||
$op_expected = TERM;
|
||
if ( $expecting_operator_token{$last_nonblank_token} ) {
|
||
$op_expected = OPERATOR;
|
||
}
|
||
elsif ( $expecting_term_token{$last_nonblank_token} ) {
|
||
|
||
# Exceptions from TERM:
|
||
|
||
# // may follow perl functions which may be unary operators
|
||
# see test file dor.t (defined or);
|
||
if ( $tok eq '/'
|
||
&& $next_type eq '/'
|
||
&& $is_keyword_rejecting_slash_as_pattern_delimiter{
|
||
$last_nonblank_token} )
|
||
{
|
||
$op_expected = OPERATOR;
|
||
}
|
||
|
||
# Patch to allow a ? following 'split' to be a depricated pattern
|
||
# delimiter. This patch is coordinated with the omission of split
|
||
# from the list
|
||
# %is_keyword_rejecting_question_as_pattern_delimiter. This patch
|
||
# will force perltidy to guess.
|
||
elsif ($tok eq '?'
|
||
&& $last_nonblank_token eq 'split' )
|
||
{
|
||
$op_expected = UNKNOWN;
|
||
}
|
||
}
|
||
} ## end type 'k'
|
||
|
||
# closing container token...
|
||
|
||
# Note that the actual token for type '}' may also be a ')'.
|
||
|
||
# Also note that $last_nonblank_token is not the token corresponding to
|
||
# $last_nonblank_type when the type is a closing container. In that
|
||
# case it is the token before the corresponding opening container token.
|
||
# So for example, for this snippet
|
||
# $a = do { BLOCK } / 2;
|
||
# the $last_nonblank_token is 'do' when $last_nonblank_type eq '}'.
|
||
|
||
elsif ( $last_nonblank_type eq '}' ) {
|
||
$op_expected = UNKNOWN;
|
||
|
||
# handle something after 'do' and 'eval'
|
||
if ( $is_block_operator{$last_nonblank_token} ) {
|
||
|
||
# something like $a = do { BLOCK } / 2;
|
||
$op_expected = OPERATOR; # block mode following }
|
||
}
|
||
|
||
elsif ( $last_nonblank_token =~ /^(\)|\$|\-\>)/ ) {
|
||
$op_expected = OPERATOR;
|
||
if ( $last_nonblank_token eq '$' ) { $op_expected = UNKNOWN }
|
||
|
||
}
|
||
|
||
# Check for smartmatch operator before preceding brace or square
|
||
# bracket. For example, at the ? after the ] in the following
|
||
# expressions we are expecting an operator:
|
||
#
|
||
# qr/3/ ~~ ['1234'] ? 1 : 0;
|
||
# map { $_ ~~ [ '0', '1' ] ? 'x' : 'o' } @a;
|
||
elsif ( $last_nonblank_token eq '~~' ) {
|
||
$op_expected = OPERATOR;
|
||
}
|
||
|
||
# A right brace here indicates the end of a simple block. All
|
||
# non-structural right braces have type 'R' all braces associated with
|
||
# block operator keywords have been given those keywords as
|
||
# "last_nonblank_token" and caught above. (This statement is order
|
||
# dependent, and must come after checking $last_nonblank_token).
|
||
else {
|
||
|
||
# patch for dor.t (defined or).
|
||
if ( $tok eq '/'
|
||
&& $next_type eq '/'
|
||
&& $last_nonblank_token eq ']' )
|
||
{
|
||
$op_expected = OPERATOR;
|
||
}
|
||
|
||
# Patch for RT #116344: misparse a ternary operator after an
|
||
# anonymous hash, like this:
|
||
# return ref {} ? 1 : 0;
|
||
# The right brace should really be marked type 'R' in this case,
|
||
# and it is safest to return an UNKNOWN here. Expecting a TERM will
|
||
# cause the '?' to always be interpreted as a pattern delimiter
|
||
# rather than introducing a ternary operator.
|
||
elsif ( $tok eq '?' ) {
|
||
$op_expected = UNKNOWN;
|
||
}
|
||
else {
|
||
$op_expected = TERM;
|
||
}
|
||
}
|
||
} ## end type '}'
|
||
|
||
# number or v-string...
|
||
# An exception is for VERSION numbers a 'use' statement. It has the format
|
||
# use Module VERSION LIST
|
||
# We could avoid this exception by writing a special sub to parse 'use'
|
||
# statements and perhaps mark these numbers with a new type V (for VERSION)
|
||
elsif ( $last_nonblank_type =~ /^[nv]$/ ) {
|
||
$op_expected = OPERATOR;
|
||
if ( $statement_type eq 'use' ) {
|
||
$op_expected = UNKNOWN;
|
||
}
|
||
}
|
||
|
||
# quote...
|
||
# FIXME: labeled prototype words should probably be given type 'A' or maybe
|
||
# 'J'; not 'q'; or maybe mark as type 'Y'
|
||
elsif ( $last_nonblank_type eq 'q' ) {
|
||
$op_expected = OPERATOR;
|
||
if ( $last_nonblank_token eq 'prototype' )
|
||
##|| $last_nonblank_token eq 'switch' )
|
||
{
|
||
$op_expected = TERM;
|
||
}
|
||
}
|
||
|
||
# file handle or similar
|
||
elsif ( $last_nonblank_type eq 'Z' ) {
|
||
|
||
$op_expected = UNKNOWN;
|
||
|
||
# angle.t
|
||
if ( $last_nonblank_token =~ /^\w/ ) {
|
||
$op_expected = UNKNOWN;
|
||
}
|
||
|
||
# The 'weird parsing rules' of next section do not work for '<' and '?'
|
||
# It is best to mark them as unknown. Test case:
|
||
# print $fh <DATA>;
|
||
elsif ( $tok =~ /^[\<\?]$/ ) {
|
||
$op_expected = UNKNOWN;
|
||
}
|
||
|
||
# For possible file handle like "$a", Perl uses weird parsing rules.
|
||
# For example:
|
||
# print $a/2,"/hi"; - division
|
||
# print $a / 2,"/hi"; - division
|
||
# print $a/ 2,"/hi"; - division
|
||
# print $a /2,"/hi"; - pattern (and error)!
|
||
# Some examples where this logic works okay, for '&','*','+':
|
||
# print $fh &xsi_protos(@mods);
|
||
# my $x = new $CompressClass *FH;
|
||
# print $OUT +( $count % 15 ? ", " : "\n\t" );
|
||
elsif ($prev_type eq 'b'
|
||
&& $next_type ne 'b' )
|
||
{
|
||
$op_expected = TERM;
|
||
}
|
||
|
||
# Note that '?' and '<' have been moved above
|
||
# ( $tok =~ /^([x\/\+\-\*\%\&\.\?\<]|\>\>)$/ ) {
|
||
elsif ( $tok =~ /^([x\/\+\-\*\%\&\.]|\>\>)$/ ) {
|
||
|
||
# Do not complain in 'use' statements, which have special syntax.
|
||
# For example, from RT#130344:
|
||
# use lib $FindBin::Bin . '/lib';
|
||
if ( $statement_type ne 'use' ) {
|
||
complain("operator in print statement not recommended\n");
|
||
}
|
||
$op_expected = OPERATOR;
|
||
}
|
||
}
|
||
|
||
# anything else...
|
||
else {
|
||
$op_expected = UNKNOWN;
|
||
}
|
||
|
||
RETURN:
|
||
|
||
# debug and diagnostics can go here..
|
||
|
||
0 && do {
|
||
print STDOUT
|
||
"EXPECT: returns $op_expected for last type $last_nonblank_type token $last_nonblank_token\n";
|
||
};
|
||
|
||
return $op_expected;
|
||
|
||
} ## end of sub operator_expected
|
||
|
||
sub new_statement_ok {
|
||
|
||
# return true if the current token can start a new statement
|
||
# USES GLOBAL VARIABLES: $last_nonblank_type
|
||
|
||
return label_ok() # a label would be ok here
|
||
|
||
|| $last_nonblank_type eq 'J'; # or we follow a label
|
||
|
||
}
|
||
|
||
sub label_ok {
|
||
|
||
# Decide if a bare word followed by a colon here is a label
|
||
# USES GLOBAL VARIABLES: $last_nonblank_token, $last_nonblank_type,
|
||
# $brace_depth, @brace_type
|
||
|
||
# if it follows an opening or closing code block curly brace..
|
||
if ( ( $last_nonblank_token eq '{' || $last_nonblank_token eq '}' )
|
||
&& $last_nonblank_type eq $last_nonblank_token )
|
||
{
|
||
|
||
# it is a label if and only if the curly encloses a code block
|
||
return $brace_type[$brace_depth];
|
||
}
|
||
|
||
# otherwise, it is a label if and only if it follows a ';' (real or fake)
|
||
# or another label
|
||
else {
|
||
return ( $last_nonblank_type eq ';' || $last_nonblank_type eq 'J' );
|
||
}
|
||
}
|
||
|
||
sub code_block_type {
|
||
|
||
# Decide if this is a block of code, and its type.
|
||
# Must be called only when $type = $token = '{'
|
||
# The problem is to distinguish between the start of a block of code
|
||
# and the start of an anonymous hash reference
|
||
# Returns "" if not code block, otherwise returns 'last_nonblank_token'
|
||
# to indicate the type of code block. (For example, 'last_nonblank_token'
|
||
# might be 'if' for an if block, 'else' for an else block, etc).
|
||
# USES GLOBAL VARIABLES: $last_nonblank_token, $last_nonblank_type,
|
||
# $last_nonblank_block_type, $brace_depth, @brace_type
|
||
|
||
# handle case of multiple '{'s
|
||
|
||
# print "BLOCK_TYPE EXAMINING: type=$last_nonblank_type tok=$last_nonblank_token\n";
|
||
|
||
my ( $i, $rtokens, $rtoken_type, $max_token_index ) = @_;
|
||
if ( $last_nonblank_token eq '{'
|
||
&& $last_nonblank_type eq $last_nonblank_token )
|
||
{
|
||
|
||
# opening brace where a statement may appear is probably
|
||
# a code block but might be and anonymous hash reference
|
||
if ( $brace_type[$brace_depth] ) {
|
||
return decide_if_code_block( $i, $rtokens, $rtoken_type,
|
||
$max_token_index );
|
||
}
|
||
|
||
# cannot start a code block within an anonymous hash
|
||
else {
|
||
return "";
|
||
}
|
||
}
|
||
|
||
elsif ( $last_nonblank_token eq ';' ) {
|
||
|
||
# an opening brace where a statement may appear is probably
|
||
# a code block but might be and anonymous hash reference
|
||
return decide_if_code_block( $i, $rtokens, $rtoken_type,
|
||
$max_token_index );
|
||
}
|
||
|
||
# handle case of '}{'
|
||
elsif ($last_nonblank_token eq '}'
|
||
&& $last_nonblank_type eq $last_nonblank_token )
|
||
{
|
||
|
||
# a } { situation ...
|
||
# could be hash reference after code block..(blktype1.t)
|
||
if ($last_nonblank_block_type) {
|
||
return decide_if_code_block( $i, $rtokens, $rtoken_type,
|
||
$max_token_index );
|
||
}
|
||
|
||
# must be a block if it follows a closing hash reference
|
||
else {
|
||
return $last_nonblank_token;
|
||
}
|
||
}
|
||
|
||
################################################################
|
||
# NOTE: braces after type characters start code blocks, but for
|
||
# simplicity these are not identified as such. See also
|
||
# sub is_non_structural_brace.
|
||
################################################################
|
||
|
||
## elsif ( $last_nonblank_type eq 't' ) {
|
||
## return $last_nonblank_token;
|
||
## }
|
||
|
||
# brace after label:
|
||
elsif ( $last_nonblank_type eq 'J' ) {
|
||
return $last_nonblank_token;
|
||
}
|
||
|
||
# otherwise, look at previous token. This must be a code block if
|
||
# it follows any of these:
|
||
# /^(BEGIN|END|CHECK|INIT|AUTOLOAD|DESTROY|UNITCHECK|continue|if|elsif|else|unless|do|while|until|eval|for|foreach|map|grep|sort)$/
|
||
elsif ( $is_code_block_token{$last_nonblank_token} ) {
|
||
|
||
# Bug Patch: Note that the opening brace after the 'if' in the following
|
||
# snippet is an anonymous hash ref and not a code block!
|
||
# print 'hi' if { x => 1, }->{x};
|
||
# We can identify this situation because the last nonblank type
|
||
# will be a keyword (instead of a closing peren)
|
||
if ( $last_nonblank_token =~ /^(if|unless)$/
|
||
&& $last_nonblank_type eq 'k' )
|
||
{
|
||
return "";
|
||
}
|
||
else {
|
||
return $last_nonblank_token;
|
||
}
|
||
}
|
||
|
||
# or a sub or package BLOCK
|
||
elsif ( ( $last_nonblank_type eq 'i' || $last_nonblank_type eq 't' )
|
||
&& $last_nonblank_token =~ /^(sub|package)\b/ )
|
||
{
|
||
return $last_nonblank_token;
|
||
}
|
||
|
||
# or a sub alias
|
||
elsif (( $last_nonblank_type eq 'i' || $last_nonblank_type eq 't' )
|
||
&& ( $is_sub{$last_nonblank_token} ) )
|
||
{
|
||
return 'sub';
|
||
}
|
||
|
||
elsif ( $statement_type =~ /^(sub|package)\b/ ) {
|
||
return $statement_type;
|
||
}
|
||
|
||
# user-defined subs with block parameters (like grep/map/eval)
|
||
elsif ( $last_nonblank_type eq 'G' ) {
|
||
return $last_nonblank_token;
|
||
}
|
||
|
||
# check bareword
|
||
elsif ( $last_nonblank_type eq 'w' ) {
|
||
return decide_if_code_block( $i, $rtokens, $rtoken_type,
|
||
$max_token_index );
|
||
}
|
||
|
||
# Patch for bug # RT #94338 reported by Daniel Trizen
|
||
# for-loop in a parenthesized block-map triggering an error message:
|
||
# map( { foreach my $item ( '0', '1' ) { print $item} } qw(a b c) );
|
||
# Check for a code block within a parenthesized function call
|
||
elsif ( $last_nonblank_token eq '(' ) {
|
||
my $paren_type = $paren_type[$paren_depth];
|
||
if ( $paren_type && $paren_type =~ /^(map|grep|sort)$/ ) {
|
||
|
||
# We will mark this as a code block but use type 't' instead
|
||
# of the name of the contining function. This will allow for
|
||
# correct parsing but will usually produce better formatting.
|
||
# Braces with block type 't' are not broken open automatically
|
||
# in the formatter as are other code block types, and this usually
|
||
# works best.
|
||
return 't'; # (Not $paren_type)
|
||
}
|
||
else {
|
||
return "";
|
||
}
|
||
}
|
||
|
||
# handle unknown syntax ') {'
|
||
# we previously appended a '()' to mark this case
|
||
elsif ( $last_nonblank_token =~ /\(\)$/ ) {
|
||
return $last_nonblank_token;
|
||
}
|
||
|
||
# anything else must be anonymous hash reference
|
||
else {
|
||
return "";
|
||
}
|
||
}
|
||
|
||
sub decide_if_code_block {
|
||
|
||
# USES GLOBAL VARIABLES: $last_nonblank_token
|
||
my ( $i, $rtokens, $rtoken_type, $max_token_index ) = @_;
|
||
|
||
my ( $next_nonblank_token, $i_next ) =
|
||
find_next_nonblank_token( $i, $rtokens, $max_token_index );
|
||
|
||
# we are at a '{' where a statement may appear.
|
||
# We must decide if this brace starts an anonymous hash or a code
|
||
# block.
|
||
# return "" if anonymous hash, and $last_nonblank_token otherwise
|
||
|
||
# initialize to be code BLOCK
|
||
my $code_block_type = $last_nonblank_token;
|
||
|
||
# Check for the common case of an empty anonymous hash reference:
|
||
# Maybe something like sub { { } }
|
||
if ( $next_nonblank_token eq '}' ) {
|
||
$code_block_type = "";
|
||
}
|
||
|
||
else {
|
||
|
||
# To guess if this '{' is an anonymous hash reference, look ahead
|
||
# and test as follows:
|
||
#
|
||
# it is a hash reference if next come:
|
||
# - a string or digit followed by a comma or =>
|
||
# - bareword followed by =>
|
||
# otherwise it is a code block
|
||
#
|
||
# Examples of anonymous hash ref:
|
||
# {'aa',};
|
||
# {1,2}
|
||
#
|
||
# Examples of code blocks:
|
||
# {1; print "hello\n", 1;}
|
||
# {$a,1};
|
||
|
||
# We are only going to look ahead one more (nonblank/comment) line.
|
||
# Strange formatting could cause a bad guess, but that's unlikely.
|
||
my @pre_types;
|
||
my @pre_tokens;
|
||
|
||
# Ignore the rest of this line if it is a side comment
|
||
if ( $next_nonblank_token ne '#' ) {
|
||
@pre_types = @{$rtoken_type}[ $i + 1 .. $max_token_index ];
|
||
@pre_tokens = @{$rtokens}[ $i + 1 .. $max_token_index ];
|
||
}
|
||
my ( $rpre_tokens, $rpre_types ) =
|
||
peek_ahead_for_n_nonblank_pre_tokens(20); # 20 is arbitrary but
|
||
# generous, and prevents
|
||
# wasting lots of
|
||
# time in mangled files
|
||
if ( defined($rpre_types) && @{$rpre_types} ) {
|
||
push @pre_types, @{$rpre_types};
|
||
push @pre_tokens, @{$rpre_tokens};
|
||
}
|
||
|
||
# put a sentinel token to simplify stopping the search
|
||
push @pre_types, '}';
|
||
push @pre_types, '}';
|
||
|
||
my $jbeg = 0;
|
||
$jbeg = 1 if $pre_types[0] eq 'b';
|
||
|
||
# first look for one of these
|
||
# - bareword
|
||
# - bareword with leading -
|
||
# - digit
|
||
# - quoted string
|
||
my $j = $jbeg;
|
||
if ( $pre_types[$j] =~ /^[\'\"]/ ) {
|
||
|
||
# find the closing quote; don't worry about escapes
|
||
my $quote_mark = $pre_types[$j];
|
||
foreach my $k ( $j + 1 .. @pre_types - 2 ) {
|
||
if ( $pre_types[$k] eq $quote_mark ) {
|
||
$j = $k + 1;
|
||
my $next = $pre_types[$j];
|
||
last;
|
||
}
|
||
}
|
||
}
|
||
elsif ( $pre_types[$j] eq 'd' ) {
|
||
$j++;
|
||
}
|
||
elsif ( $pre_types[$j] eq 'w' ) {
|
||
$j++;
|
||
}
|
||
elsif ( $pre_types[$j] eq '-' && $pre_types[ ++$j ] eq 'w' ) {
|
||
$j++;
|
||
}
|
||
if ( $j > $jbeg ) {
|
||
|
||
$j++ if $pre_types[$j] eq 'b';
|
||
|
||
# Patched for RT #95708
|
||
if (
|
||
|
||
# it is a comma which is not a pattern delimeter except for qw
|
||
(
|
||
$pre_types[$j] eq ','
|
||
&& $pre_tokens[$jbeg] !~ /^(s|m|y|tr|qr|q|qq|qx)$/
|
||
)
|
||
|
||
# or a =>
|
||
|| ( $pre_types[$j] eq '=' && $pre_types[ ++$j ] eq '>' )
|
||
)
|
||
{
|
||
$code_block_type = "";
|
||
}
|
||
}
|
||
}
|
||
|
||
return $code_block_type;
|
||
}
|
||
|
||
sub report_unexpected {
|
||
|
||
# report unexpected token type and show where it is
|
||
# USES GLOBAL VARIABLES: $tokenizer_self
|
||
my ( $found, $expecting, $i_tok, $last_nonblank_i, $rpretoken_map,
|
||
$rpretoken_type, $input_line )
|
||
= @_;
|
||
|
||
if ( ++$tokenizer_self->[_unexpected_error_count_] <= MAX_NAG_MESSAGES ) {
|
||
my $msg = "found $found where $expecting expected";
|
||
my $pos = $rpretoken_map->[$i_tok];
|
||
interrupt_logfile();
|
||
my $input_line_number = $tokenizer_self->[_last_line_number_];
|
||
my ( $offset, $numbered_line, $underline ) =
|
||
make_numbered_line( $input_line_number, $input_line, $pos );
|
||
$underline = write_on_underline( $underline, $pos - $offset, '^' );
|
||
|
||
my $trailer = "";
|
||
if ( ( $i_tok > 0 ) && ( $last_nonblank_i >= 0 ) ) {
|
||
my $pos_prev = $rpretoken_map->[$last_nonblank_i];
|
||
my $num;
|
||
if ( $rpretoken_type->[ $i_tok - 1 ] eq 'b' ) {
|
||
$num = $rpretoken_map->[ $i_tok - 1 ] - $pos_prev;
|
||
}
|
||
else {
|
||
$num = $pos - $pos_prev;
|
||
}
|
||
if ( $num > 40 ) { $num = 40; $pos_prev = $pos - 40; }
|
||
|
||
$underline =
|
||
write_on_underline( $underline, $pos_prev - $offset, '-' x $num );
|
||
$trailer = " (previous token underlined)";
|
||
}
|
||
$underline =~ s/\s+$//;
|
||
warning( $numbered_line . "\n" );
|
||
warning( $underline . "\n" );
|
||
warning( $msg . $trailer . "\n" );
|
||
resume_logfile();
|
||
}
|
||
return;
|
||
}
|
||
|
||
sub is_non_structural_brace {
|
||
|
||
# Decide if a brace or bracket is structural or non-structural
|
||
# by looking at the previous token and type
|
||
# USES GLOBAL VARIABLES: $last_nonblank_type, $last_nonblank_token
|
||
|
||
# EXPERIMENTAL: Mark slices as structural; idea was to improve formatting.
|
||
# Tentatively deactivated because it caused the wrong operator expectation
|
||
# for this code:
|
||
# $user = @vars[1] / 100;
|
||
# Must update sub operator_expected before re-implementing.
|
||
# if ( $last_nonblank_type eq 'i' && $last_nonblank_token =~ /^@/ ) {
|
||
# return 0;
|
||
# }
|
||
|
||
################################################################
|
||
# NOTE: braces after type characters start code blocks, but for
|
||
# simplicity these are not identified as such. See also
|
||
# sub code_block_type
|
||
################################################################
|
||
|
||
##if ($last_nonblank_type eq 't') {return 0}
|
||
|
||
# otherwise, it is non-structural if it is decorated
|
||
# by type information.
|
||
# For example, the '{' here is non-structural: ${xxx}
|
||
return (
|
||
$last_nonblank_token =~ /^([\$\@\*\&\%\)]|->|::)/
|
||
|
||
# or if we follow a hash or array closing curly brace or bracket
|
||
# For example, the second '{' in this is non-structural: $a{'x'}{'y'}
|
||
# because the first '}' would have been given type 'R'
|
||
|| $last_nonblank_type =~ /^([R\]])$/
|
||
);
|
||
}
|
||
|
||
#########i#############################################################
|
||
# Tokenizer routines for tracking container nesting depths
|
||
#######################################################################
|
||
|
||
# The following routines keep track of nesting depths of the nesting
|
||
# types, ( [ { and ?. This is necessary for determining the indentation
|
||
# level, and also for debugging programs. Not only do they keep track of
|
||
# nesting depths of the individual brace types, but they check that each
|
||
# of the other brace types is balanced within matching pairs. For
|
||
# example, if the program sees this sequence:
|
||
#
|
||
# { ( ( ) }
|
||
#
|
||
# then it can determine that there is an extra left paren somewhere
|
||
# between the { and the }. And so on with every other possible
|
||
# combination of outer and inner brace types. For another
|
||
# example:
|
||
#
|
||
# ( [ ..... ] ] )
|
||
#
|
||
# which has an extra ] within the parens.
|
||
#
|
||
# The brace types have indexes 0 .. 3 which are indexes into
|
||
# the matrices.
|
||
#
|
||
# The pair ? : are treated as just another nesting type, with ? acting
|
||
# as the opening brace and : acting as the closing brace.
|
||
#
|
||
# The matrix
|
||
#
|
||
# $depth_array[$a][$b][ $current_depth[$a] ] = $current_depth[$b];
|
||
#
|
||
# saves the nesting depth of brace type $b (where $b is either of the other
|
||
# nesting types) when brace type $a enters a new depth. When this depth
|
||
# decreases, a check is made that the current depth of brace types $b is
|
||
# unchanged, or otherwise there must have been an error. This can
|
||
# be very useful for localizing errors, particularly when perl runs to
|
||
# the end of a large file (such as this one) and announces that there
|
||
# is a problem somewhere.
|
||
#
|
||
# A numerical sequence number is maintained for every nesting type,
|
||
# so that each matching pair can be uniquely identified in a simple
|
||
# way.
|
||
|
||
sub increase_nesting_depth {
|
||
my ( $aa, $pos ) = @_;
|
||
|
||
# USES GLOBAL VARIABLES: $tokenizer_self, @current_depth,
|
||
# @current_sequence_number, @depth_array, @starting_line_of_current_depth,
|
||
# $statement_type
|
||
$current_depth[$aa]++;
|
||
$total_depth++;
|
||
$total_depth[$aa][ $current_depth[$aa] ] = $total_depth;
|
||
my $input_line_number = $tokenizer_self->[_last_line_number_];
|
||
my $input_line = $tokenizer_self->[_line_of_text_];
|
||
|
||
# Sequence numbers increment by number of items. This keeps
|
||
# a unique set of numbers but still allows the relative location
|
||
# of any type to be determined.
|
||
$nesting_sequence_number[$aa] += scalar(@closing_brace_names);
|
||
my $seqno = $nesting_sequence_number[$aa];
|
||
$current_sequence_number[$aa][ $current_depth[$aa] ] = $seqno;
|
||
|
||
$starting_line_of_current_depth[$aa][ $current_depth[$aa] ] =
|
||
[ $input_line_number, $input_line, $pos ];
|
||
|
||
for my $bb ( 0 .. @closing_brace_names - 1 ) {
|
||
next if ( $bb == $aa );
|
||
$depth_array[$aa][$bb][ $current_depth[$aa] ] = $current_depth[$bb];
|
||
}
|
||
|
||
# set a flag for indenting a nested ternary statement
|
||
my $indent = 0;
|
||
if ( $aa == QUESTION_COLON ) {
|
||
$nested_ternary_flag[ $current_depth[$aa] ] = 0;
|
||
if ( $current_depth[$aa] > 1 ) {
|
||
if ( $nested_ternary_flag[ $current_depth[$aa] - 1 ] == 0 ) {
|
||
my $pdepth = $total_depth[$aa][ $current_depth[$aa] - 1 ];
|
||
if ( $pdepth == $total_depth - 1 ) {
|
||
$indent = 1;
|
||
$nested_ternary_flag[ $current_depth[$aa] - 1 ] = -1;
|
||
}
|
||
}
|
||
}
|
||
}
|
||
$nested_statement_type[$aa][ $current_depth[$aa] ] = $statement_type;
|
||
$statement_type = "";
|
||
return ( $seqno, $indent );
|
||
}
|
||
|
||
sub is_balanced_closing_container {
|
||
|
||
# Return true if a closing container can go here without error
|
||
# Return false if not
|
||
my ($aa) = @_;
|
||
|
||
# cannot close if there was no opening
|
||
return unless ( $current_depth[$aa] > 0 );
|
||
|
||
# check that any other brace types $bb contained within would be balanced
|
||
for my $bb ( 0 .. @closing_brace_names - 1 ) {
|
||
next if ( $bb == $aa );
|
||
return
|
||
unless ( $depth_array[$aa][$bb][ $current_depth[$aa] ] ==
|
||
$current_depth[$bb] );
|
||
}
|
||
|
||
# OK, everything will be balanced
|
||
return 1;
|
||
}
|
||
|
||
sub decrease_nesting_depth {
|
||
|
||
my ( $aa, $pos ) = @_;
|
||
|
||
# USES GLOBAL VARIABLES: $tokenizer_self, @current_depth,
|
||
# @current_sequence_number, @depth_array, @starting_line_of_current_depth
|
||
# $statement_type
|
||
my $seqno = 0;
|
||
my $input_line_number = $tokenizer_self->[_last_line_number_];
|
||
my $input_line = $tokenizer_self->[_line_of_text_];
|
||
|
||
my $outdent = 0;
|
||
$total_depth--;
|
||
if ( $current_depth[$aa] > 0 ) {
|
||
|
||
# set a flag for un-indenting after seeing a nested ternary statement
|
||
$seqno = $current_sequence_number[$aa][ $current_depth[$aa] ];
|
||
if ( $aa == QUESTION_COLON ) {
|
||
$outdent = $nested_ternary_flag[ $current_depth[$aa] ];
|
||
}
|
||
$statement_type = $nested_statement_type[$aa][ $current_depth[$aa] ];
|
||
|
||
# check that any brace types $bb contained within are balanced
|
||
for my $bb ( 0 .. @closing_brace_names - 1 ) {
|
||
next if ( $bb == $aa );
|
||
|
||
unless ( $depth_array[$aa][$bb][ $current_depth[$aa] ] ==
|
||
$current_depth[$bb] )
|
||
{
|
||
my $diff =
|
||
$current_depth[$bb] -
|
||
$depth_array[$aa][$bb][ $current_depth[$aa] ];
|
||
|
||
# don't whine too many times
|
||
my $saw_brace_error = get_saw_brace_error();
|
||
if (
|
||
$saw_brace_error <= MAX_NAG_MESSAGES
|
||
|
||
# if too many closing types have occurred, we probably
|
||
# already caught this error
|
||
&& ( ( $diff > 0 ) || ( $saw_brace_error <= 0 ) )
|
||
)
|
||
{
|
||
interrupt_logfile();
|
||
my $rsl =
|
||
$starting_line_of_current_depth[$aa]
|
||
[ $current_depth[$aa] ];
|
||
my $sl = $rsl->[0];
|
||
my $rel = [ $input_line_number, $input_line, $pos ];
|
||
my $el = $rel->[0];
|
||
my ($ess);
|
||
|
||
if ( $diff == 1 || $diff == -1 ) {
|
||
$ess = '';
|
||
}
|
||
else {
|
||
$ess = 's';
|
||
}
|
||
my $bname =
|
||
( $diff > 0 )
|
||
? $opening_brace_names[$bb]
|
||
: $closing_brace_names[$bb];
|
||
write_error_indicator_pair( @{$rsl}, '^' );
|
||
my $msg = <<"EOM";
|
||
Found $diff extra $bname$ess between $opening_brace_names[$aa] on line $sl and $closing_brace_names[$aa] on line $el
|
||
EOM
|
||
|
||
if ( $diff > 0 ) {
|
||
my $rml =
|
||
$starting_line_of_current_depth[$bb]
|
||
[ $current_depth[$bb] ];
|
||
my $ml = $rml->[0];
|
||
$msg .=
|
||
" The most recent un-matched $bname is on line $ml\n";
|
||
write_error_indicator_pair( @{$rml}, '^' );
|
||
}
|
||
write_error_indicator_pair( @{$rel}, '^' );
|
||
warning($msg);
|
||
resume_logfile();
|
||
}
|
||
increment_brace_error();
|
||
}
|
||
}
|
||
$current_depth[$aa]--;
|
||
}
|
||
else {
|
||
|
||
my $saw_brace_error = get_saw_brace_error();
|
||
if ( $saw_brace_error <= MAX_NAG_MESSAGES ) {
|
||
my $msg = <<"EOM";
|
||
There is no previous $opening_brace_names[$aa] to match a $closing_brace_names[$aa] on line $input_line_number
|
||
EOM
|
||
indicate_error( $msg, $input_line_number, $input_line, $pos, '^' );
|
||
}
|
||
increment_brace_error();
|
||
|
||
# keep track of errors in braces alone (ignoring ternary nesting errors)
|
||
$tokenizer_self->[_true_brace_error_count_]++
|
||
if ( $closing_brace_names[$aa] ne "':'" );
|
||
}
|
||
return ( $seqno, $outdent );
|
||
}
|
||
|
||
sub check_final_nesting_depths {
|
||
|
||
# USES GLOBAL VARIABLES: @current_depth, @starting_line_of_current_depth
|
||
|
||
for my $aa ( 0 .. @closing_brace_names - 1 ) {
|
||
|
||
if ( $current_depth[$aa] ) {
|
||
my $rsl =
|
||
$starting_line_of_current_depth[$aa][ $current_depth[$aa] ];
|
||
my $sl = $rsl->[0];
|
||
my $msg = <<"EOM";
|
||
Final nesting depth of $opening_brace_names[$aa]s is $current_depth[$aa]
|
||
The most recent un-matched $opening_brace_names[$aa] is on line $sl
|
||
EOM
|
||
indicate_error( $msg, @{$rsl}, '^' );
|
||
increment_brace_error();
|
||
}
|
||
}
|
||
return;
|
||
}
|
||
|
||
#########i#############################################################
|
||
# Tokenizer routines for looking ahead in input stream
|
||
#######################################################################
|
||
|
||
sub peek_ahead_for_n_nonblank_pre_tokens {
|
||
|
||
# returns next n pretokens if they exist
|
||
# returns undef's if hits eof without seeing any pretokens
|
||
# USES GLOBAL VARIABLES: $tokenizer_self
|
||
my $max_pretokens = shift;
|
||
my $line;
|
||
my $i = 0;
|
||
my ( $rpre_tokens, $rmap, $rpre_types );
|
||
|
||
while ( $line =
|
||
$tokenizer_self->[_line_buffer_object_]->peek_ahead( $i++ ) )
|
||
{
|
||
$line =~ s/^\s*//; # trim leading blanks
|
||
next if ( length($line) <= 0 ); # skip blank
|
||
next if ( $line =~ /^#/ ); # skip comment
|
||
( $rpre_tokens, $rmap, $rpre_types ) =
|
||
pre_tokenize( $line, $max_pretokens );
|
||
last;
|
||
}
|
||
return ( $rpre_tokens, $rpre_types );
|
||
}
|
||
|
||
# look ahead for next non-blank, non-comment line of code
|
||
sub peek_ahead_for_nonblank_token {
|
||
|
||
# USES GLOBAL VARIABLES: $tokenizer_self
|
||
my ( $rtokens, $max_token_index ) = @_;
|
||
my $line;
|
||
my $i = 0;
|
||
|
||
while ( $line =
|
||
$tokenizer_self->[_line_buffer_object_]->peek_ahead( $i++ ) )
|
||
{
|
||
$line =~ s/^\s*//; # trim leading blanks
|
||
next if ( length($line) <= 0 ); # skip blank
|
||
next if ( $line =~ /^#/ ); # skip comment
|
||
my ( $rtok, $rmap, $rtype ) =
|
||
pre_tokenize( $line, 2 ); # only need 2 pre-tokens
|
||
my $j = $max_token_index + 1;
|
||
|
||
foreach my $tok ( @{$rtok} ) {
|
||
last if ( $tok =~ "\n" );
|
||
$rtokens->[ ++$j ] = $tok;
|
||
}
|
||
last;
|
||
}
|
||
return $rtokens;
|
||
}
|
||
|
||
#########i#############################################################
|
||
# Tokenizer guessing routines for ambiguous situations
|
||
#######################################################################
|
||
|
||
sub guess_if_pattern_or_conditional {
|
||
|
||
# this routine is called when we have encountered a ? following an
|
||
# unknown bareword, and we must decide if it starts a pattern or not
|
||
# input parameters:
|
||
# $i - token index of the ? starting possible pattern
|
||
# output parameters:
|
||
# $is_pattern = 0 if probably not pattern, =1 if probably a pattern
|
||
# msg = a warning or diagnostic message
|
||
# USES GLOBAL VARIABLES: $last_nonblank_token
|
||
|
||
# FIXME: this needs to be rewritten
|
||
|
||
my ( $i, $rtokens, $rtoken_map, $max_token_index ) = @_;
|
||
my $is_pattern = 0;
|
||
my $msg = "guessing that ? after $last_nonblank_token starts a ";
|
||
|
||
if ( $i >= $max_token_index ) {
|
||
$msg .= "conditional (no end to pattern found on the line)\n";
|
||
}
|
||
else {
|
||
my $ibeg = $i;
|
||
$i = $ibeg + 1;
|
||
my $next_token = $rtokens->[$i]; # first token after ?
|
||
|
||
# look for a possible ending ? on this line..
|
||
my $in_quote = 1;
|
||
my $quote_depth = 0;
|
||
my $quote_character = '';
|
||
my $quote_pos = 0;
|
||
my $quoted_string;
|
||
(
|
||
$i, $in_quote, $quote_character, $quote_pos, $quote_depth,
|
||
$quoted_string
|
||
)
|
||
= follow_quoted_string( $ibeg, $in_quote, $rtokens, $quote_character,
|
||
$quote_pos, $quote_depth, $max_token_index );
|
||
|
||
if ($in_quote) {
|
||
|
||
# we didn't find an ending ? on this line,
|
||
# so we bias towards conditional
|
||
$is_pattern = 0;
|
||
$msg .= "conditional (no ending ? on this line)\n";
|
||
|
||
# we found an ending ?, so we bias towards a pattern
|
||
}
|
||
else {
|
||
|
||
# Watch out for an ending ? in quotes, like this
|
||
# my $case_flag = File::Spec->case_tolerant ? '(?i)' : '';
|
||
my $s_quote = 0;
|
||
my $d_quote = 0;
|
||
my $colons = 0;
|
||
foreach my $ii ( $ibeg + 1 .. $i - 1 ) {
|
||
my $tok = $rtokens->[$ii];
|
||
if ( $tok eq ":" ) { $colons++ }
|
||
if ( $tok eq "'" ) { $s_quote++ }
|
||
if ( $tok eq '"' ) { $d_quote++ }
|
||
}
|
||
if ( $s_quote % 2 || $d_quote % 2 || $colons ) {
|
||
$is_pattern = 0;
|
||
$msg .= "found ending ? but unbalanced quote chars\n";
|
||
}
|
||
elsif ( pattern_expected( $i, $rtokens, $max_token_index ) >= 0 ) {
|
||
$is_pattern = 1;
|
||
$msg .= "pattern (found ending ? and pattern expected)\n";
|
||
}
|
||
else {
|
||
$msg .= "pattern (uncertain, but found ending ?)\n";
|
||
}
|
||
}
|
||
}
|
||
return ( $is_pattern, $msg );
|
||
}
|
||
|
||
sub guess_if_pattern_or_division {
|
||
|
||
# this routine is called when we have encountered a / following an
|
||
# unknown bareword, and we must decide if it starts a pattern or is a
|
||
# division
|
||
# input parameters:
|
||
# $i - token index of the / starting possible pattern
|
||
# output parameters:
|
||
# $is_pattern = 0 if probably division, =1 if probably a pattern
|
||
# msg = a warning or diagnostic message
|
||
# USES GLOBAL VARIABLES: $last_nonblank_token
|
||
my ( $i, $rtokens, $rtoken_map, $max_token_index ) = @_;
|
||
my $is_pattern = 0;
|
||
my $msg = "guessing that / after $last_nonblank_token starts a ";
|
||
|
||
if ( $i >= $max_token_index ) {
|
||
$msg .= "division (no end to pattern found on the line)\n";
|
||
}
|
||
else {
|
||
my $ibeg = $i;
|
||
my $divide_expected =
|
||
numerator_expected( $i, $rtokens, $max_token_index );
|
||
$i = $ibeg + 1;
|
||
my $next_token = $rtokens->[$i]; # first token after slash
|
||
|
||
# One of the things we can look at is the spacing around the slash.
|
||
# There # are four possible spacings around the first slash:
|
||
#
|
||
# return pi/two;#/; -/-
|
||
# return pi/ two;#/; -/+
|
||
# return pi / two;#/; +/+
|
||
# return pi /two;#/; +/- <-- possible pattern
|
||
#
|
||
# Spacing rule: a space before the slash but not after the slash
|
||
# usually indicates a pattern. We can use this to break ties.
|
||
|
||
my $is_pattern_by_spacing =
|
||
( $i > 1 && $next_token ne ' ' && $rtokens->[ $i - 2 ] eq ' ' );
|
||
|
||
# look for a possible ending / on this line..
|
||
my $in_quote = 1;
|
||
my $quote_depth = 0;
|
||
my $quote_character = '';
|
||
my $quote_pos = 0;
|
||
my $quoted_string;
|
||
(
|
||
$i, $in_quote, $quote_character, $quote_pos, $quote_depth,
|
||
$quoted_string
|
||
)
|
||
= follow_quoted_string( $ibeg, $in_quote, $rtokens, $quote_character,
|
||
$quote_pos, $quote_depth, $max_token_index );
|
||
|
||
if ($in_quote) {
|
||
|
||
# we didn't find an ending / on this line, so we bias towards
|
||
# division
|
||
if ( $divide_expected >= 0 ) {
|
||
$is_pattern = 0;
|
||
$msg .= "division (no ending / on this line)\n";
|
||
}
|
||
else {
|
||
|
||
# assuming a multi-line pattern ... this is risky, but division
|
||
# does not seem possible. If this fails, it would either be due
|
||
# to a syntax error in the code, or the division_expected logic
|
||
# needs to be fixed.
|
||
$msg = "multi-line pattern (division not possible)\n";
|
||
$is_pattern = 1;
|
||
}
|
||
}
|
||
|
||
# we found an ending /, so we bias slightly towards a pattern
|
||
else {
|
||
|
||
my $pattern_expected =
|
||
pattern_expected( $i, $rtokens, $max_token_index );
|
||
|
||
if ( $pattern_expected >= 0 ) {
|
||
|
||
# pattern looks possible...
|
||
if ( $divide_expected >= 0 ) {
|
||
|
||
# Both pattern and divide can work here...
|
||
|
||
# A very common bare word in math expressions is 'pi'
|
||
if ( $last_nonblank_token eq 'pi' ) {
|
||
$msg .= "division (pattern works too but saw 'pi')\n";
|
||
$is_pattern = 0;
|
||
}
|
||
|
||
# A very common bare word in pattern expressions is 'ok'
|
||
elsif ( $last_nonblank_token eq 'ok' ) {
|
||
$msg .= "pattern (division works too but saw 'ok')\n";
|
||
$is_pattern = 1;
|
||
}
|
||
|
||
# If one rule is more definite, use it
|
||
elsif ( $divide_expected > $pattern_expected ) {
|
||
$msg .=
|
||
"division (more likely based on following tokens)\n";
|
||
$is_pattern = 0;
|
||
}
|
||
|
||
# otherwise, use the spacing rule
|
||
elsif ($is_pattern_by_spacing) {
|
||
$msg .=
|
||
"pattern (guess on spacing, but division possible too)\n";
|
||
$is_pattern = 1;
|
||
}
|
||
else {
|
||
$msg .=
|
||
"division (guess on spacing, but pattern is possible too)\n";
|
||
$is_pattern = 0;
|
||
}
|
||
}
|
||
|
||
# divide_expected < 0 means divide can not work here
|
||
else {
|
||
$is_pattern = 1;
|
||
$msg .= "pattern (division not possible)\n";
|
||
}
|
||
}
|
||
|
||
# pattern does not look possible...
|
||
else {
|
||
|
||
if ( $divide_expected >= 0 ) {
|
||
$is_pattern = 0;
|
||
$msg .= "division (pattern not possible)\n";
|
||
}
|
||
|
||
# Neither pattern nor divide look possible...go by spacing
|
||
else {
|
||
if ($is_pattern_by_spacing) {
|
||
$msg .= "pattern (guess on spacing)\n";
|
||
$is_pattern = 1;
|
||
}
|
||
else {
|
||
$msg .= "division (guess on spacing)\n";
|
||
$is_pattern = 0;
|
||
}
|
||
}
|
||
}
|
||
}
|
||
}
|
||
return ( $is_pattern, $msg );
|
||
}
|
||
|
||
# try to resolve here-doc vs. shift by looking ahead for
|
||
# non-code or the end token (currently only looks for end token)
|
||
# returns 1 if it is probably a here doc, 0 if not
|
||
sub guess_if_here_doc {
|
||
|
||
# This is how many lines we will search for a target as part of the
|
||
# guessing strategy. It is a constant because there is probably
|
||
# little reason to change it.
|
||
# USES GLOBAL VARIABLES: $tokenizer_self, $current_package
|
||
# %is_constant,
|
||
my $HERE_DOC_WINDOW = 40;
|
||
|
||
my $next_token = shift;
|
||
my $here_doc_expected = 0;
|
||
my $line;
|
||
my $k = 0;
|
||
my $msg = "checking <<";
|
||
|
||
while ( $line =
|
||
$tokenizer_self->[_line_buffer_object_]->peek_ahead( $k++ ) )
|
||
{
|
||
chomp $line;
|
||
|
||
if ( $line =~ /^$next_token$/ ) {
|
||
$msg .= " -- found target $next_token ahead $k lines\n";
|
||
$here_doc_expected = 1; # got it
|
||
last;
|
||
}
|
||
last if ( $k >= $HERE_DOC_WINDOW );
|
||
}
|
||
|
||
unless ($here_doc_expected) {
|
||
|
||
if ( !defined($line) ) {
|
||
$here_doc_expected = -1; # hit eof without seeing target
|
||
$msg .= " -- must be shift; target $next_token not in file\n";
|
||
|
||
}
|
||
else { # still unsure..taking a wild guess
|
||
|
||
if ( !$is_constant{$current_package}{$next_token} ) {
|
||
$here_doc_expected = 1;
|
||
$msg .=
|
||
" -- guessing it's a here-doc ($next_token not a constant)\n";
|
||
}
|
||
else {
|
||
$msg .=
|
||
" -- guessing it's a shift ($next_token is a constant)\n";
|
||
}
|
||
}
|
||
}
|
||
write_logfile_entry($msg);
|
||
return $here_doc_expected;
|
||
}
|
||
|
||
#########i#############################################################
|
||
# Tokenizer Routines for scanning identifiers and related items
|
||
#######################################################################
|
||
|
||
sub scan_bare_identifier_do {
|
||
|
||
# this routine is called to scan a token starting with an alphanumeric
|
||
# variable or package separator, :: or '.
|
||
# USES GLOBAL VARIABLES: $current_package, $last_nonblank_token,
|
||
# $last_nonblank_type,@paren_type, $paren_depth
|
||
|
||
my ( $input_line, $i, $tok, $type, $prototype, $rtoken_map,
|
||
$max_token_index )
|
||
= @_;
|
||
my $i_begin = $i;
|
||
my $package = undef;
|
||
|
||
my $i_beg = $i;
|
||
|
||
# we have to back up one pretoken at a :: since each : is one pretoken
|
||
if ( $tok eq '::' ) { $i_beg-- }
|
||
if ( $tok eq '->' ) { $i_beg-- }
|
||
my $pos_beg = $rtoken_map->[$i_beg];
|
||
pos($input_line) = $pos_beg;
|
||
|
||
# Examples:
|
||
# A::B::C
|
||
# A::
|
||
# ::A
|
||
# A'B
|
||
if ( $input_line =~ m/\G\s*((?:\w*(?:'|::)))*(?:(?:->)?(\w+))?/gc ) {
|
||
|
||
my $pos = pos($input_line);
|
||
my $numc = $pos - $pos_beg;
|
||
$tok = substr( $input_line, $pos_beg, $numc );
|
||
|
||
# type 'w' includes anything without leading type info
|
||
# ($,%,@,*) including something like abc::def::ghi
|
||
$type = 'w';
|
||
|
||
my $sub_name = "";
|
||
if ( defined($2) ) { $sub_name = $2; }
|
||
if ( defined($1) ) {
|
||
$package = $1;
|
||
|
||
# patch: don't allow isolated package name which just ends
|
||
# in the old style package separator (single quote). Example:
|
||
# use CGI':all';
|
||
if ( !($sub_name) && substr( $package, -1, 1 ) eq '\'' ) {
|
||
$pos--;
|
||
}
|
||
|
||
$package =~ s/\'/::/g;
|
||
if ( $package =~ /^\:/ ) { $package = 'main' . $package }
|
||
$package =~ s/::$//;
|
||
}
|
||
else {
|
||
$package = $current_package;
|
||
|
||
if ( $is_keyword{$tok} ) {
|
||
$type = 'k';
|
||
}
|
||
}
|
||
|
||
# if it is a bareword..
|
||
if ( $type eq 'w' ) {
|
||
|
||
# check for v-string with leading 'v' type character
|
||
# (This seems to have precedence over filehandle, type 'Y')
|
||
if ( $tok =~ /^v\d[_\d]*$/ ) {
|
||
|
||
# we only have the first part - something like 'v101' -
|
||
# look for more
|
||
if ( $input_line =~ m/\G(\.\d[_\d]*)+/gc ) {
|
||
$pos = pos($input_line);
|
||
$numc = $pos - $pos_beg;
|
||
$tok = substr( $input_line, $pos_beg, $numc );
|
||
}
|
||
$type = 'v';
|
||
|
||
# warn if this version can't handle v-strings
|
||
report_v_string($tok);
|
||
}
|
||
|
||
elsif ( $is_constant{$package}{$sub_name} ) {
|
||
$type = 'C';
|
||
}
|
||
|
||
# bareword after sort has implied empty prototype; for example:
|
||
# @sorted = sort numerically ( 53, 29, 11, 32, 7 );
|
||
# This has priority over whatever the user has specified.
|
||
elsif ($last_nonblank_token eq 'sort'
|
||
&& $last_nonblank_type eq 'k' )
|
||
{
|
||
$type = 'Z';
|
||
}
|
||
|
||
# Note: strangely, perl does not seem to really let you create
|
||
# functions which act like eval and do, in the sense that eval
|
||
# and do may have operators following the final }, but any operators
|
||
# that you create with prototype (&) apparently do not allow
|
||
# trailing operators, only terms. This seems strange.
|
||
# If this ever changes, here is the update
|
||
# to make perltidy behave accordingly:
|
||
|
||
# elsif ( $is_block_function{$package}{$tok} ) {
|
||
# $tok='eval'; # patch to do braces like eval - doesn't work
|
||
# $type = 'k';
|
||
#}
|
||
# FIXME: This could become a separate type to allow for different
|
||
# future behavior:
|
||
elsif ( $is_block_function{$package}{$sub_name} ) {
|
||
$type = 'G';
|
||
}
|
||
elsif ( $is_block_list_function{$package}{$sub_name} ) {
|
||
$type = 'G';
|
||
}
|
||
elsif ( $is_user_function{$package}{$sub_name} ) {
|
||
$type = 'U';
|
||
$prototype = $user_function_prototype{$package}{$sub_name};
|
||
}
|
||
|
||
# check for indirect object
|
||
elsif (
|
||
|
||
# added 2001-03-27: must not be followed immediately by '('
|
||
# see fhandle.t
|
||
( $input_line !~ m/\G\(/gc )
|
||
|
||
# and
|
||
&& (
|
||
|
||
# preceded by keyword like 'print', 'printf' and friends
|
||
$is_indirect_object_taker{$last_nonblank_token}
|
||
|
||
# or preceded by something like 'print(' or 'printf('
|
||
|| (
|
||
( $last_nonblank_token eq '(' )
|
||
&& $is_indirect_object_taker{ $paren_type[$paren_depth]
|
||
}
|
||
|
||
)
|
||
)
|
||
)
|
||
{
|
||
|
||
# may not be indirect object unless followed by a space
|
||
if ( $input_line =~ m/\G\s+/gc ) {
|
||
$type = 'Y';
|
||
|
||
# Abandon Hope ...
|
||
# Perl's indirect object notation is a very bad
|
||
# thing and can cause subtle bugs, especially for
|
||
# beginning programmers. And I haven't even been
|
||
# able to figure out a sane warning scheme which
|
||
# doesn't get in the way of good scripts.
|
||
|
||
# Complain if a filehandle has any lower case
|
||
# letters. This is suggested good practice.
|
||
# Use 'sub_name' because something like
|
||
# main::MYHANDLE is ok for filehandle
|
||
if ( $sub_name =~ /[a-z]/ ) {
|
||
|
||
# could be bug caused by older perltidy if
|
||
# followed by '('
|
||
if ( $input_line =~ m/\G\s*\(/gc ) {
|
||
complain(
|
||
"Caution: unknown word '$tok' in indirect object slot\n"
|
||
);
|
||
}
|
||
}
|
||
}
|
||
|
||
# bareword not followed by a space -- may not be filehandle
|
||
# (may be function call defined in a 'use' statement)
|
||
else {
|
||
$type = 'Z';
|
||
}
|
||
}
|
||
}
|
||
|
||
# Now we must convert back from character position
|
||
# to pre_token index.
|
||
# I don't think an error flag can occur here ..but who knows
|
||
my $error;
|
||
( $i, $error ) =
|
||
inverse_pretoken_map( $i, $pos, $rtoken_map, $max_token_index );
|
||
if ($error) {
|
||
warning("scan_bare_identifier: Possibly invalid tokenization\n");
|
||
}
|
||
}
|
||
|
||
# no match but line not blank - could be syntax error
|
||
# perl will take '::' alone without complaint
|
||
else {
|
||
$type = 'w';
|
||
|
||
# change this warning to log message if it becomes annoying
|
||
warning("didn't find identifier after leading ::\n");
|
||
}
|
||
return ( $i, $tok, $type, $prototype );
|
||
}
|
||
|
||
sub scan_id_do {
|
||
|
||
# This is the new scanner and will eventually replace scan_identifier.
|
||
# Only type 'sub' and 'package' are implemented.
|
||
# Token types $ * % @ & -> are not yet implemented.
|
||
#
|
||
# Scan identifier following a type token.
|
||
# The type of call depends on $id_scan_state: $id_scan_state = ''
|
||
# for starting call, in which case $tok must be the token defining
|
||
# the type.
|
||
#
|
||
# If the type token is the last nonblank token on the line, a value
|
||
# of $id_scan_state = $tok is returned, indicating that further
|
||
# calls must be made to get the identifier. If the type token is
|
||
# not the last nonblank token on the line, the identifier is
|
||
# scanned and handled and a value of '' is returned.
|
||
# USES GLOBAL VARIABLES: $current_package, $last_nonblank_token, $in_attribute_list,
|
||
# $statement_type, $tokenizer_self
|
||
|
||
my ( $input_line, $i, $tok, $rtokens, $rtoken_map, $id_scan_state,
|
||
$max_token_index )
|
||
= @_;
|
||
use constant DEBUG_NSCAN => 0;
|
||
my $type = '';
|
||
my ( $i_beg, $pos_beg );
|
||
|
||
#print "NSCAN:entering i=$i, tok=$tok, type=$type, state=$id_scan_state\n";
|
||
#my ($a,$b,$c) = caller;
|
||
#print "NSCAN: scan_id called with tok=$tok $a $b $c\n";
|
||
|
||
# on re-entry, start scanning at first token on the line
|
||
if ($id_scan_state) {
|
||
$i_beg = $i;
|
||
$type = '';
|
||
}
|
||
|
||
# on initial entry, start scanning just after type token
|
||
else {
|
||
$i_beg = $i + 1;
|
||
$id_scan_state = $tok;
|
||
$type = 't';
|
||
}
|
||
|
||
# find $i_beg = index of next nonblank token,
|
||
# and handle empty lines
|
||
my $blank_line = 0;
|
||
my $next_nonblank_token = $rtokens->[$i_beg];
|
||
if ( $i_beg > $max_token_index ) {
|
||
$blank_line = 1;
|
||
}
|
||
else {
|
||
|
||
# only a '#' immediately after a '$' is not a comment
|
||
if ( $next_nonblank_token eq '#' ) {
|
||
unless ( $tok eq '$' ) {
|
||
$blank_line = 1;
|
||
}
|
||
}
|
||
|
||
if ( $next_nonblank_token =~ /^\s/ ) {
|
||
( $next_nonblank_token, $i_beg ) =
|
||
find_next_nonblank_token_on_this_line( $i_beg, $rtokens,
|
||
$max_token_index );
|
||
if ( $next_nonblank_token =~ /(^#|^\s*$)/ ) {
|
||
$blank_line = 1;
|
||
}
|
||
}
|
||
}
|
||
|
||
# handle non-blank line; identifier, if any, must follow
|
||
unless ($blank_line) {
|
||
|
||
if ( $is_sub{$id_scan_state} ) {
|
||
( $i, $tok, $type, $id_scan_state ) = do_scan_sub(
|
||
{
|
||
input_line => $input_line,
|
||
i => $i,
|
||
i_beg => $i_beg,
|
||
tok => $tok,
|
||
type => $type,
|
||
rtokens => $rtokens,
|
||
rtoken_map => $rtoken_map,
|
||
id_scan_state => $id_scan_state,
|
||
max_token_index => $max_token_index
|
||
}
|
||
);
|
||
}
|
||
|
||
elsif ( $is_package{$id_scan_state} ) {
|
||
( $i, $tok, $type ) =
|
||
do_scan_package( $input_line, $i, $i_beg, $tok, $type, $rtokens,
|
||
$rtoken_map, $max_token_index );
|
||
$id_scan_state = '';
|
||
}
|
||
|
||
else {
|
||
warning("invalid token in scan_id: $tok\n");
|
||
$id_scan_state = '';
|
||
}
|
||
}
|
||
|
||
if ( $id_scan_state && ( !defined($type) || !$type ) ) {
|
||
|
||
# shouldn't happen:
|
||
warning(
|
||
"Program bug in scan_id: undefined type but scan_state=$id_scan_state\n"
|
||
);
|
||
report_definite_bug();
|
||
}
|
||
|
||
DEBUG_NSCAN && do {
|
||
print STDOUT
|
||
"NSCAN: returns i=$i, tok=$tok, type=$type, state=$id_scan_state\n";
|
||
};
|
||
return ( $i, $tok, $type, $id_scan_state );
|
||
}
|
||
|
||
sub check_prototype {
|
||
my ( $proto, $package, $subname ) = @_;
|
||
return unless ( defined($package) && defined($subname) );
|
||
if ( defined($proto) ) {
|
||
$proto =~ s/^\s*\(\s*//;
|
||
$proto =~ s/\s*\)$//;
|
||
if ($proto) {
|
||
$is_user_function{$package}{$subname} = 1;
|
||
$user_function_prototype{$package}{$subname} = "($proto)";
|
||
|
||
# prototypes containing '&' must be treated specially..
|
||
if ( $proto =~ /\&/ ) {
|
||
|
||
# right curly braces of prototypes ending in
|
||
# '&' may be followed by an operator
|
||
if ( $proto =~ /\&$/ ) {
|
||
$is_block_function{$package}{$subname} = 1;
|
||
}
|
||
|
||
# right curly braces of prototypes NOT ending in
|
||
# '&' may NOT be followed by an operator
|
||
elsif ( $proto !~ /\&$/ ) {
|
||
$is_block_list_function{$package}{$subname} = 1;
|
||
}
|
||
}
|
||
}
|
||
else {
|
||
$is_constant{$package}{$subname} = 1;
|
||
}
|
||
}
|
||
else {
|
||
$is_user_function{$package}{$subname} = 1;
|
||
}
|
||
return;
|
||
}
|
||
|
||
sub do_scan_package {
|
||
|
||
# do_scan_package parses a package name
|
||
# it is called with $i_beg equal to the index of the first nonblank
|
||
# token following a 'package' token.
|
||
# USES GLOBAL VARIABLES: $current_package,
|
||
|
||
# package NAMESPACE
|
||
# package NAMESPACE VERSION
|
||
# package NAMESPACE BLOCK
|
||
# package NAMESPACE VERSION BLOCK
|
||
#
|
||
# If VERSION is provided, package sets the $VERSION variable in the given
|
||
# namespace to a version object with the VERSION provided. VERSION must be
|
||
# a "strict" style version number as defined by the version module: a
|
||
# positive decimal number (integer or decimal-fraction) without
|
||
# exponentiation or else a dotted-decimal v-string with a leading 'v'
|
||
# character and at least three components.
|
||
# reference http://perldoc.perl.org/functions/package.html
|
||
|
||
my ( $input_line, $i, $i_beg, $tok, $type, $rtokens, $rtoken_map,
|
||
$max_token_index )
|
||
= @_;
|
||
my $package = undef;
|
||
my $pos_beg = $rtoken_map->[$i_beg];
|
||
pos($input_line) = $pos_beg;
|
||
|
||
# handle non-blank line; package name, if any, must follow
|
||
if ( $input_line =~ m/\G\s*((?:\w*(?:'|::))*\w*)/gc ) {
|
||
$package = $1;
|
||
$package = ( defined($1) && $1 ) ? $1 : 'main';
|
||
$package =~ s/\'/::/g;
|
||
if ( $package =~ /^\:/ ) { $package = 'main' . $package }
|
||
$package =~ s/::$//;
|
||
my $pos = pos($input_line);
|
||
my $numc = $pos - $pos_beg;
|
||
$tok = 'package ' . substr( $input_line, $pos_beg, $numc );
|
||
$type = 'i';
|
||
|
||
# Now we must convert back from character position
|
||
# to pre_token index.
|
||
# I don't think an error flag can occur here ..but ?
|
||
my $error;
|
||
( $i, $error ) =
|
||
inverse_pretoken_map( $i, $pos, $rtoken_map, $max_token_index );
|
||
if ($error) { warning("Possibly invalid package\n") }
|
||
$current_package = $package;
|
||
|
||
# we should now have package NAMESPACE
|
||
# now expecting VERSION, BLOCK, or ; to follow ...
|
||
# package NAMESPACE VERSION
|
||
# package NAMESPACE BLOCK
|
||
# package NAMESPACE VERSION BLOCK
|
||
my ( $next_nonblank_token, $i_next ) =
|
||
find_next_nonblank_token( $i, $rtokens, $max_token_index );
|
||
|
||
# check that something recognizable follows, but do not parse.
|
||
# A VERSION number will be parsed later as a number or v-string in the
|
||
# normal way. What is important is to set the statement type if
|
||
# everything looks okay so that the operator_expected() routine
|
||
# knows that the number is in a package statement.
|
||
# Examples of valid primitive tokens that might follow are:
|
||
# 1235 . ; { } v3 v
|
||
if ( $next_nonblank_token =~ /^([v\.\d;\{\}])|v\d|\d+$/ ) {
|
||
$statement_type = $tok;
|
||
}
|
||
else {
|
||
warning(
|
||
"Unexpected '$next_nonblank_token' after package name '$tok'\n"
|
||
);
|
||
}
|
||
}
|
||
|
||
# no match but line not blank --
|
||
# could be a label with name package, like package: , for example.
|
||
else {
|
||
$type = 'k';
|
||
}
|
||
|
||
return ( $i, $tok, $type );
|
||
}
|
||
|
||
sub scan_identifier_do {
|
||
|
||
# This routine assembles tokens into identifiers. It maintains a
|
||
# scan state, id_scan_state. It updates id_scan_state based upon
|
||
# current id_scan_state and token, and returns an updated
|
||
# id_scan_state and the next index after the identifier.
|
||
|
||
# USES GLOBAL VARIABLES: $context, $last_nonblank_token,
|
||
# $last_nonblank_type
|
||
|
||
my ( $i, $id_scan_state, $identifier, $rtokens, $max_token_index,
|
||
$expecting, $container_type )
|
||
= @_;
|
||
use constant DEBUG_SCAN_ID => 0;
|
||
my $i_begin = $i;
|
||
my $type = '';
|
||
my $tok_begin = $rtokens->[$i_begin];
|
||
if ( $tok_begin eq ':' ) { $tok_begin = '::' }
|
||
my $id_scan_state_begin = $id_scan_state;
|
||
my $identifier_begin = $identifier;
|
||
my $tok = $tok_begin;
|
||
my $message = "";
|
||
my $tok_is_blank; # a flag to speed things up
|
||
|
||
my $in_prototype_or_signature =
|
||
$container_type && $container_type =~ /^sub\b/;
|
||
|
||
# these flags will be used to help figure out the type:
|
||
my $saw_alpha;
|
||
my $saw_type;
|
||
|
||
# allow old package separator (') except in 'use' statement
|
||
my $allow_tick = ( $last_nonblank_token ne 'use' );
|
||
|
||
#########################################################
|
||
# get started by defining a type and a state if necessary
|
||
#########################################################
|
||
|
||
if ( !$id_scan_state ) {
|
||
$context = UNKNOWN_CONTEXT;
|
||
|
||
# fixup for digraph
|
||
if ( $tok eq '>' ) {
|
||
$tok = '->';
|
||
$tok_begin = $tok;
|
||
}
|
||
$identifier = $tok;
|
||
|
||
if ( $tok eq '$' || $tok eq '*' ) {
|
||
$id_scan_state = '$';
|
||
$context = SCALAR_CONTEXT;
|
||
}
|
||
elsif ( $tok eq '%' || $tok eq '@' ) {
|
||
$id_scan_state = '$';
|
||
$context = LIST_CONTEXT;
|
||
}
|
||
elsif ( $tok eq '&' ) {
|
||
$id_scan_state = '&';
|
||
}
|
||
elsif ( $tok eq 'sub' or $tok eq 'package' ) {
|
||
$saw_alpha = 0; # 'sub' is considered type info here
|
||
$id_scan_state = '$';
|
||
$identifier .= ' '; # need a space to separate sub from sub name
|
||
}
|
||
elsif ( $tok eq '::' ) {
|
||
$id_scan_state = 'A';
|
||
}
|
||
elsif ( $tok =~ /^\w/ ) {
|
||
$id_scan_state = ':';
|
||
$saw_alpha = 1;
|
||
}
|
||
elsif ( $tok eq '->' ) {
|
||
$id_scan_state = '$';
|
||
}
|
||
else {
|
||
|
||
# shouldn't happen
|
||
my ( $a, $b, $c ) = caller;
|
||
warning("Program Bug: scan_identifier given bad token = $tok \n");
|
||
warning(" called from sub $a line: $c\n");
|
||
report_definite_bug();
|
||
}
|
||
$saw_type = !$saw_alpha;
|
||
}
|
||
else {
|
||
$i--;
|
||
$saw_alpha = ( $tok =~ /^\w/ );
|
||
$saw_type = ( $tok =~ /([\$\%\@\*\&])/ );
|
||
}
|
||
|
||
###############################
|
||
# loop to gather the identifier
|
||
###############################
|
||
|
||
my $i_save = $i;
|
||
|
||
while ( $i < $max_token_index ) {
|
||
my $last_tok_is_blank = $tok_is_blank;
|
||
if ($tok_is_blank) { $tok_is_blank = undef }
|
||
else { $i_save = $i }
|
||
|
||
$tok = $rtokens->[ ++$i ];
|
||
|
||
# patch to make digraph :: if necessary
|
||
if ( ( $tok eq ':' ) && ( $rtokens->[ $i + 1 ] eq ':' ) ) {
|
||
$tok = '::';
|
||
$i++;
|
||
}
|
||
|
||
########################
|
||
# Starting variable name
|
||
########################
|
||
|
||
if ( $id_scan_state eq '$' ) {
|
||
|
||
if ( $tok eq '$' ) {
|
||
|
||
$identifier .= $tok;
|
||
|
||
# we've got a punctuation variable if end of line (punct.t)
|
||
if ( $i == $max_token_index ) {
|
||
$type = 'i';
|
||
$id_scan_state = '';
|
||
last;
|
||
}
|
||
}
|
||
elsif ( $tok =~ /^\w/ ) { # alphanumeric ..
|
||
$saw_alpha = 1;
|
||
$id_scan_state = ':'; # now need ::
|
||
$identifier .= $tok;
|
||
}
|
||
elsif ( $tok eq '::' ) {
|
||
$id_scan_state = 'A';
|
||
$identifier .= $tok;
|
||
}
|
||
|
||
# POSTDEFREF ->@ ->% ->& ->*
|
||
elsif ( ( $tok =~ /^[\@\%\&\*]$/ ) && $identifier =~ /\-\>$/ ) {
|
||
$identifier .= $tok;
|
||
}
|
||
elsif ( $tok eq "'" && $allow_tick ) { # alphanumeric ..
|
||
$saw_alpha = 1;
|
||
$id_scan_state = ':'; # now need ::
|
||
$identifier .= $tok;
|
||
|
||
# Perl will accept leading digits in identifiers,
|
||
# although they may not always produce useful results.
|
||
# Something like $main::0 is ok. But this also works:
|
||
#
|
||
# sub howdy::123::bubba{ print "bubba $54321!\n" }
|
||
# howdy::123::bubba();
|
||
#
|
||
}
|
||
|
||
# $# and POSTDEFREF ->$#
|
||
elsif (
|
||
( $tok eq '#' )
|
||
&& ( $identifier =~ /\$$/ )
|
||
|
||
# a # inside a prototype or signature can only start a comment
|
||
&& !$in_prototype_or_signature
|
||
)
|
||
{
|
||
# A '#' starts a comment if it follows a space. For example,
|
||
# the following is equivalent to $ans=40.
|
||
# my $ #
|
||
# ans = 40;
|
||
if ($last_tok_is_blank) {
|
||
$type = 'i';
|
||
if ( $id_scan_state eq '$' ) { $type = 't' }
|
||
$i = $i_save;
|
||
$id_scan_state = '';
|
||
last;
|
||
}
|
||
|
||
# May be '$#' or '$#array'
|
||
$identifier .= $tok; # keep same state, a $ could follow
|
||
}
|
||
|
||
elsif ( $tok eq '{' ) {
|
||
|
||
# check for something like ${#} or ${<7B>}
|
||
if (
|
||
(
|
||
$identifier eq '$'
|
||
|| $identifier eq '@'
|
||
|| $identifier eq '$#'
|
||
)
|
||
&& $i + 2 <= $max_token_index
|
||
&& $rtokens->[ $i + 2 ] eq '}'
|
||
&& $rtokens->[ $i + 1 ] !~ /[\s\w]/
|
||
)
|
||
{
|
||
my $next2 = $rtokens->[ $i + 2 ];
|
||
my $next1 = $rtokens->[ $i + 1 ];
|
||
$identifier .= $tok . $next1 . $next2;
|
||
$i += 2;
|
||
$id_scan_state = '';
|
||
last;
|
||
}
|
||
|
||
# skip something like ${xxx} or ->{
|
||
$id_scan_state = '';
|
||
|
||
# if this is the first token of a line, any tokens for this
|
||
# identifier have already been accumulated
|
||
if ( $identifier eq '$' || $i == 0 ) { $identifier = ''; }
|
||
$i = $i_save;
|
||
last;
|
||
}
|
||
|
||
# space ok after leading $ % * & @
|
||
elsif ( $tok =~ /^\s*$/ ) {
|
||
|
||
$tok_is_blank = 1;
|
||
|
||
if ( $identifier =~ /^[\$\%\*\&\@]/ ) {
|
||
|
||
if ( length($identifier) > 1 ) {
|
||
$id_scan_state = '';
|
||
$i = $i_save;
|
||
$type = 'i'; # probably punctuation variable
|
||
last;
|
||
}
|
||
else {
|
||
|
||
# spaces after $'s are common, and space after @
|
||
# is harmless, so only complain about space
|
||
# after other type characters. Space after $ and
|
||
# @ will be removed in formatting. Report space
|
||
# after % and * because they might indicate a
|
||
# parsing error. In other words '% ' might be a
|
||
# modulo operator. Delete this warning if it
|
||
# gets annoying.
|
||
if ( $identifier !~ /^[\@\$]$/ ) {
|
||
$message =
|
||
"Space in identifier, following $identifier\n";
|
||
}
|
||
}
|
||
}
|
||
|
||
# else:
|
||
# space after '->' is ok
|
||
}
|
||
elsif ( $tok eq '^' ) {
|
||
|
||
# check for some special variables like $^W
|
||
if ( $identifier =~ /^[\$\*\@\%]$/ ) {
|
||
$identifier .= $tok;
|
||
$id_scan_state = 'A';
|
||
|
||
# Perl accepts '$^]' or '@^]', but
|
||
# there must not be a space before the ']'.
|
||
my $next1 = $rtokens->[ $i + 1 ];
|
||
if ( $next1 eq ']' ) {
|
||
$i++;
|
||
$identifier .= $next1;
|
||
$id_scan_state = "";
|
||
last;
|
||
}
|
||
}
|
||
else {
|
||
$id_scan_state = '';
|
||
}
|
||
}
|
||
else { # something else
|
||
|
||
if ( $in_prototype_or_signature && $tok =~ /^[\),=#]/ ) {
|
||
|
||
# We might be in an extrusion of
|
||
# sub foo2 ( $first, $, $third ) {
|
||
# looking at a line starting with a comma, like
|
||
# $
|
||
# ,
|
||
# in this case the comma ends the signature variable
|
||
# '$' which will have been previously marked type 't'
|
||
# rather than 'i'.
|
||
if ( $i == $i_begin ) {
|
||
$identifier = "";
|
||
$type = "";
|
||
}
|
||
|
||
# at a # we have to mark as type 't' because more may
|
||
# follow, otherwise, in a signature we can let '$' be an
|
||
# identifier here for better formatting.
|
||
# See 'mangle4.in' for a test case.
|
||
else {
|
||
$type = 'i';
|
||
if ( $id_scan_state eq '$' && $tok eq '#' ) {
|
||
$type = 't';
|
||
}
|
||
$i = $i_save;
|
||
}
|
||
$id_scan_state = '';
|
||
last;
|
||
}
|
||
|
||
# check for various punctuation variables
|
||
if ( $identifier =~ /^[\$\*\@\%]$/ ) {
|
||
$identifier .= $tok;
|
||
}
|
||
|
||
# POSTDEFREF: Postfix reference ->$* ->%* ->@* ->** ->&* ->$#*
|
||
elsif ($tok eq '*'
|
||
&& $identifier =~ /\-\>([\@\%\$\*\&]|\$\#)$/ )
|
||
{
|
||
$identifier .= $tok;
|
||
}
|
||
|
||
elsif ( $identifier eq '$#' ) {
|
||
|
||
if ( $tok eq '{' ) { $type = 'i'; $i = $i_save }
|
||
|
||
# perl seems to allow just these: $#: $#- $#+
|
||
elsif ( $tok =~ /^[\:\-\+]$/ ) {
|
||
$type = 'i';
|
||
$identifier .= $tok;
|
||
}
|
||
else {
|
||
$i = $i_save;
|
||
write_logfile_entry( 'Use of $# is deprecated' . "\n" );
|
||
}
|
||
}
|
||
elsif ( $identifier eq '$$' ) {
|
||
|
||
# perl does not allow references to punctuation
|
||
# variables without braces. For example, this
|
||
# won't work:
|
||
# $:=\4;
|
||
# $a = $$:;
|
||
# You would have to use
|
||
# $a = ${$:};
|
||
|
||
# '$$' alone is punctuation variable for PID
|
||
$i = $i_save;
|
||
if ( $tok eq '{' ) { $type = 't' }
|
||
else { $type = 'i' }
|
||
}
|
||
elsif ( $identifier eq '->' ) {
|
||
$i = $i_save;
|
||
}
|
||
else {
|
||
$i = $i_save;
|
||
if ( length($identifier) == 1 ) { $identifier = ''; }
|
||
}
|
||
$id_scan_state = '';
|
||
last;
|
||
}
|
||
}
|
||
|
||
###################################
|
||
# looking for alphanumeric after ::
|
||
###################################
|
||
|
||
elsif ( $id_scan_state eq 'A' ) {
|
||
|
||
$tok_is_blank = $tok =~ /^\s*$/;
|
||
|
||
if ( $tok =~ /^\w/ ) { # found it
|
||
$identifier .= $tok;
|
||
$id_scan_state = ':'; # now need ::
|
||
$saw_alpha = 1;
|
||
}
|
||
elsif ( $tok eq "'" && $allow_tick ) {
|
||
$identifier .= $tok;
|
||
$id_scan_state = ':'; # now need ::
|
||
$saw_alpha = 1;
|
||
}
|
||
elsif ( $tok_is_blank && $identifier =~ /^sub / ) {
|
||
$id_scan_state = '(';
|
||
$identifier .= $tok;
|
||
}
|
||
elsif ( $tok eq '(' && $identifier =~ /^sub / ) {
|
||
$id_scan_state = ')';
|
||
$identifier .= $tok;
|
||
}
|
||
else {
|
||
$id_scan_state = '';
|
||
$i = $i_save;
|
||
last;
|
||
}
|
||
}
|
||
|
||
###################################
|
||
# looking for :: after alphanumeric
|
||
###################################
|
||
|
||
elsif ( $id_scan_state eq ':' ) { # looking for :: after alpha
|
||
|
||
$tok_is_blank = $tok =~ /^\s*$/;
|
||
|
||
if ( $tok eq '::' ) { # got it
|
||
$identifier .= $tok;
|
||
$id_scan_state = 'A'; # now require alpha
|
||
}
|
||
elsif ( $tok =~ /^\w/ ) { # more alphanumeric is ok here
|
||
$identifier .= $tok;
|
||
$id_scan_state = ':'; # now need ::
|
||
$saw_alpha = 1;
|
||
}
|
||
elsif ( $tok eq "'" && $allow_tick ) { # tick
|
||
|
||
if ( $is_keyword{$identifier} ) {
|
||
$id_scan_state = ''; # that's all
|
||
$i = $i_save;
|
||
}
|
||
else {
|
||
$identifier .= $tok;
|
||
}
|
||
}
|
||
elsif ( $tok_is_blank && $identifier =~ /^sub / ) {
|
||
$id_scan_state = '(';
|
||
$identifier .= $tok;
|
||
}
|
||
elsif ( $tok eq '(' && $identifier =~ /^sub / ) {
|
||
$id_scan_state = ')';
|
||
$identifier .= $tok;
|
||
}
|
||
else {
|
||
$id_scan_state = ''; # that's all
|
||
$i = $i_save;
|
||
last;
|
||
}
|
||
}
|
||
|
||
##############################
|
||
# looking for '(' of prototype
|
||
##############################
|
||
|
||
elsif ( $id_scan_state eq '(' ) {
|
||
|
||
if ( $tok eq '(' ) { # got it
|
||
$identifier .= $tok;
|
||
$id_scan_state = ')'; # now find the end of it
|
||
}
|
||
elsif ( $tok =~ /^\s*$/ ) { # blank - keep going
|
||
$identifier .= $tok;
|
||
$tok_is_blank = 1;
|
||
}
|
||
else {
|
||
$id_scan_state = ''; # that's all - no prototype
|
||
$i = $i_save;
|
||
last;
|
||
}
|
||
}
|
||
|
||
##############################
|
||
# looking for ')' of prototype
|
||
##############################
|
||
|
||
elsif ( $id_scan_state eq ')' ) {
|
||
|
||
$tok_is_blank = $tok =~ /^\s*$/;
|
||
|
||
if ( $tok eq ')' ) { # got it
|
||
$identifier .= $tok;
|
||
$id_scan_state = ''; # all done
|
||
last;
|
||
}
|
||
elsif ( $tok =~ /^[\s\$\%\\\*\@\&\;]/ ) {
|
||
$identifier .= $tok;
|
||
}
|
||
else { # probable error in script, but keep going
|
||
warning("Unexpected '$tok' while seeking end of prototype\n");
|
||
$identifier .= $tok;
|
||
}
|
||
}
|
||
|
||
###################
|
||
# Starting sub call
|
||
###################
|
||
|
||
elsif ( $id_scan_state eq '&' ) {
|
||
|
||
if ( $tok =~ /^[\$\w]/ ) { # alphanumeric ..
|
||
$id_scan_state = ':'; # now need ::
|
||
$saw_alpha = 1;
|
||
$identifier .= $tok;
|
||
}
|
||
elsif ( $tok eq "'" && $allow_tick ) { # alphanumeric ..
|
||
$id_scan_state = ':'; # now need ::
|
||
$saw_alpha = 1;
|
||
$identifier .= $tok;
|
||
}
|
||
elsif ( $tok =~ /^\s*$/ ) { # allow space
|
||
$tok_is_blank = 1;
|
||
}
|
||
elsif ( $tok eq '::' ) { # leading ::
|
||
$id_scan_state = 'A'; # accept alpha next
|
||
$identifier .= $tok;
|
||
}
|
||
elsif ( $tok eq '{' ) {
|
||
if ( $identifier eq '&' || $i == 0 ) { $identifier = ''; }
|
||
$i = $i_save;
|
||
$id_scan_state = '';
|
||
last;
|
||
}
|
||
else {
|
||
|
||
# punctuation variable?
|
||
# testfile: cunningham4.pl
|
||
#
|
||
# We have to be careful here. If we are in an unknown state,
|
||
# we will reject the punctuation variable. In the following
|
||
# example the '&' is a binary operator but we are in an unknown
|
||
# state because there is no sigil on 'Prima', so we don't
|
||
# know what it is. But it is a bad guess that
|
||
# '&~' is a function variable.
|
||
# $self->{text}->{colorMap}->[
|
||
# Prima::PodView::COLOR_CODE_FOREGROUND
|
||
# & ~tb::COLOR_INDEX ] =
|
||
# $sec->{ColorCode}
|
||
if ( $identifier eq '&' && $expecting ) {
|
||
$identifier .= $tok;
|
||
}
|
||
else {
|
||
$identifier = '';
|
||
$i = $i_save;
|
||
$type = '&';
|
||
}
|
||
$id_scan_state = '';
|
||
last;
|
||
}
|
||
}
|
||
|
||
######################
|
||
# unknown state - quit
|
||
######################
|
||
|
||
else { # can get here due to error in initialization
|
||
$id_scan_state = '';
|
||
$i = $i_save;
|
||
last;
|
||
}
|
||
} ## end of main loop
|
||
|
||
if ( $id_scan_state eq ')' ) {
|
||
warning("Hit end of line while seeking ) to end prototype\n");
|
||
}
|
||
|
||
# once we enter the actual identifier, it may not extend beyond
|
||
# the end of the current line
|
||
if ( $id_scan_state =~ /^[A\:\(\)]/ ) {
|
||
$id_scan_state = '';
|
||
}
|
||
|
||
# Patch: the deprecated variable $# does not combine with anything on the
|
||
# next line.
|
||
if ( $identifier eq '$#' ) { $id_scan_state = '' }
|
||
|
||
if ( $i < 0 ) { $i = 0 }
|
||
|
||
# Be sure a token type is defined
|
||
if ( !$type ) {
|
||
|
||
if ($saw_type) {
|
||
|
||
if ($saw_alpha) {
|
||
if ( $identifier =~ /^->/ && $last_nonblank_type eq 'w' ) {
|
||
$type = 'w';
|
||
}
|
||
else { $type = 'i' }
|
||
}
|
||
elsif ( $identifier eq '->' ) {
|
||
$type = '->';
|
||
}
|
||
elsif (
|
||
( length($identifier) > 1 )
|
||
|
||
# In something like '@$=' we have an identifier '@$'
|
||
# In something like '$${' we have type '$$' (and only
|
||
# part of an identifier)
|
||
&& !( $identifier =~ /\$$/ && $tok eq '{' )
|
||
&& ( $identifier !~ /^(sub |package )$/ )
|
||
)
|
||
{
|
||
$type = 'i';
|
||
}
|
||
else { $type = 't' }
|
||
}
|
||
elsif ($saw_alpha) {
|
||
|
||
# type 'w' includes anything without leading type info
|
||
# ($,%,@,*) including something like abc::def::ghi
|
||
$type = 'w';
|
||
}
|
||
else {
|
||
$type = '';
|
||
} # this can happen on a restart
|
||
}
|
||
|
||
# See if we formed an identifier...
|
||
if ($identifier) {
|
||
$tok = $identifier;
|
||
if ($message) { write_logfile_entry($message) }
|
||
}
|
||
|
||
# did not find an identifier, back up
|
||
else {
|
||
$tok = $tok_begin;
|
||
$i = $i_begin;
|
||
}
|
||
|
||
DEBUG_SCAN_ID && do {
|
||
my ( $a, $b, $c ) = caller;
|
||
print STDOUT
|
||
"SCANID: called from $a $b $c with tok, i, state, identifier =$tok_begin, $i_begin, $id_scan_state_begin, $identifier_begin\n";
|
||
print STDOUT
|
||
"SCANID: returned with tok, i, state, identifier =$tok, $i, $id_scan_state, $identifier\n";
|
||
};
|
||
return ( $i, $tok, $type, $id_scan_state, $identifier );
|
||
}
|
||
|
||
{ ## closure for sub do_scan_sub
|
||
|
||
# saved package and subnames in case prototype is on separate line
|
||
my ( $package_saved, $subname_saved );
|
||
|
||
# initialize subname each time a new 'sub' keyword is encountered
|
||
sub initialize_subname {
|
||
$package_saved = "";
|
||
$subname_saved = "";
|
||
return;
|
||
}
|
||
|
||
use constant {
|
||
SUB_CALL => 1,
|
||
PAREN_CALL => 2,
|
||
PROTOTYPE_CALL => 3,
|
||
};
|
||
|
||
sub do_scan_sub {
|
||
|
||
# do_scan_sub parses a sub name and prototype.
|
||
|
||
# At present there are three basic CALL TYPES which are
|
||
# distinguished by the starting value of '$tok':
|
||
# 1. $tok='sub', id_scan_state='sub'
|
||
# it is called with $i_beg equal to the index of the first nonblank
|
||
# token following a 'sub' token.
|
||
# 2. $tok='(', id_scan_state='sub',
|
||
# it is called with $i_beg equal to the index of a '(' which may
|
||
# start a prototype.
|
||
# 3. $tok='prototype', id_scan_state='prototype'
|
||
# it is called with $i_beg equal to the index of a '(' which is
|
||
# preceded by ': prototype' and has $id_scan_state eq 'prototype'
|
||
|
||
# Examples:
|
||
|
||
# A single type 1 call will get both the sub and prototype
|
||
# sub foo1 ( $$ ) { }
|
||
# ^
|
||
|
||
# The subname will be obtained with a 'sub' call
|
||
# The prototype on line 2 will be obtained with a '(' call
|
||
# sub foo1
|
||
# ^ <---call type 1
|
||
# ( $$ ) { }
|
||
# ^ <---call type 2
|
||
|
||
# The subname will be obtained with a 'sub' call
|
||
# The prototype will be obtained with a 'prototype' call
|
||
# sub foo1 ( $x, $y ) : prototype ( $$ ) { }
|
||
# ^ <---type 1 ^ <---type 3
|
||
|
||
# TODO: add future error checks to be sure we have a valid
|
||
# sub name. For example, 'sub &doit' is wrong. Also, be sure
|
||
# a name is given if and only if a non-anonymous sub is
|
||
# appropriate.
|
||
# USES GLOBAL VARS: $current_package, $last_nonblank_token,
|
||
# $in_attribute_list, %saw_function_definition,
|
||
# $statement_type
|
||
|
||
my ($rinput_hash) = @_;
|
||
|
||
my $input_line = $rinput_hash->{input_line};
|
||
my $i = $rinput_hash->{i};
|
||
my $i_beg = $rinput_hash->{i_beg};
|
||
my $tok = $rinput_hash->{tok};
|
||
my $type = $rinput_hash->{type};
|
||
my $rtokens = $rinput_hash->{rtokens};
|
||
my $rtoken_map = $rinput_hash->{rtoken_map};
|
||
my $id_scan_state = $rinput_hash->{id_scan_state};
|
||
my $max_token_index = $rinput_hash->{max_token_index};
|
||
|
||
# Determine the CALL TYPE
|
||
# 1=sub
|
||
# 2=(
|
||
# 3=prototype
|
||
my $call_type =
|
||
$tok eq 'prototype' ? PROTOTYPE_CALL
|
||
: $tok eq '(' ? PAREN_CALL
|
||
: SUB_CALL;
|
||
|
||
$id_scan_state = ""; # normally we get everything in one call
|
||
my $subname = $subname_saved;
|
||
my $package = $package_saved;
|
||
my $proto = undef;
|
||
my $attrs = undef;
|
||
my $match;
|
||
|
||
my $pos_beg = $rtoken_map->[$i_beg];
|
||
pos($input_line) = $pos_beg;
|
||
|
||
# Look for the sub NAME if this is a SUB call
|
||
if (
|
||
$call_type == SUB_CALL
|
||
&& $input_line =~ m/\G\s*
|
||
((?:\w*(?:'|::))*) # package - something that ends in :: or '
|
||
(\w+) # NAME - required
|
||
/gcx
|
||
)
|
||
{
|
||
$match = 1;
|
||
$subname = $2;
|
||
|
||
$package = ( defined($1) && $1 ) ? $1 : $current_package;
|
||
$package =~ s/\'/::/g;
|
||
if ( $package =~ /^\:/ ) { $package = 'main' . $package }
|
||
$package =~ s/::$//;
|
||
my $pos = pos($input_line);
|
||
my $numc = $pos - $pos_beg;
|
||
$tok = 'sub ' . substr( $input_line, $pos_beg, $numc );
|
||
$type = 'i';
|
||
|
||
# remember the sub name in case another call is needed to
|
||
# get the prototype
|
||
$package_saved = $package;
|
||
$subname_saved = $subname;
|
||
}
|
||
|
||
# Now look for PROTO ATTRS for all call types
|
||
# Look for prototype/attributes which are usually on the same
|
||
# line as the sub name but which might be on a separate line.
|
||
# For example, we might have an anonymous sub with attributes,
|
||
# or a prototype on a separate line from its sub name
|
||
|
||
# NOTE: We only want to parse PROTOTYPES here. If we see anything that
|
||
# does not look like a prototype, we assume it is a SIGNATURE and we
|
||
# will stop and let the the standard tokenizer handle it. In
|
||
# particular, we stop if we see any nested parens, braces, or commas.
|
||
# Also note, a valid prototype cannot contain any alphabetic character
|
||
# -- see https://perldoc.perl.org/perlsub
|
||
# But it appears that an underscore is valid in a prototype, so the
|
||
# regex below uses [A-Za-z] rather than \w
|
||
# This is the old regex which has been replaced:
|
||
# $input_line =~ m/\G(\s*\([^\)\(\}\{\,#]*\))? # PROTO
|
||
my $saw_opening_paren = $input_line =~ /\G\s*\(/;
|
||
if (
|
||
$input_line =~ m/\G(\s*\([^\)\(\}\{\,#A-Za-z]*\))? # PROTO
|
||
(\s*:)? # ATTRS leading ':'
|
||
/gcx
|
||
&& ( $1 || $2 )
|
||
)
|
||
{
|
||
$proto = $1;
|
||
$attrs = $2;
|
||
|
||
# Append the prototype to the starting token if it is 'sub' or
|
||
# 'prototype'. This is not necessary but for compatibility with previous
|
||
# versions when the -csc flag is used:
|
||
if ( $proto && ( $match || $call_type == PROTOTYPE_CALL ) ) {
|
||
$tok .= $proto;
|
||
}
|
||
|
||
# If we just entered the sub at an opening paren on this call, not
|
||
# a following :prototype, label it with the previous token. This is
|
||
# necessary to propagate the sub name to its opening block.
|
||
elsif ( $call_type == PAREN_CALL ) {
|
||
$tok = $last_nonblank_token;
|
||
}
|
||
|
||
$match ||= 1;
|
||
$type = 'i';
|
||
}
|
||
|
||
if ($match) {
|
||
|
||
# ATTRS: if there are attributes, back up and let the ':' be
|
||
# found later by the scanner.
|
||
my $pos = pos($input_line);
|
||
if ($attrs) {
|
||
$pos -= length($attrs);
|
||
}
|
||
|
||
my $next_nonblank_token = $tok;
|
||
|
||
# catch case of line with leading ATTR ':' after anonymous sub
|
||
if ( $pos == $pos_beg && $tok eq ':' ) {
|
||
$type = 'A';
|
||
$in_attribute_list = 1;
|
||
}
|
||
|
||
# Otherwise, if we found a match we must convert back from
|
||
# string position to the pre_token index for continued parsing.
|
||
else {
|
||
|
||
# I don't think an error flag can occur here ..but ?
|
||
my $error;
|
||
( $i, $error ) = inverse_pretoken_map( $i, $pos, $rtoken_map,
|
||
$max_token_index );
|
||
if ($error) { warning("Possibly invalid sub\n") }
|
||
|
||
# check for multiple definitions of a sub
|
||
( $next_nonblank_token, my $i_next ) =
|
||
find_next_nonblank_token_on_this_line( $i, $rtokens,
|
||
$max_token_index );
|
||
}
|
||
|
||
if ( $next_nonblank_token =~ /^(\s*|#)$/ )
|
||
{ # skip blank or side comment
|
||
my ( $rpre_tokens, $rpre_types ) =
|
||
peek_ahead_for_n_nonblank_pre_tokens(1);
|
||
if ( defined($rpre_tokens) && @{$rpre_tokens} ) {
|
||
$next_nonblank_token = $rpre_tokens->[0];
|
||
}
|
||
else {
|
||
$next_nonblank_token = '}';
|
||
}
|
||
}
|
||
|
||
# See what's next...
|
||
if ( $next_nonblank_token eq '{' ) {
|
||
if ($subname) {
|
||
|
||
# Check for multiple definitions of a sub, but
|
||
# it is ok to have multiple sub BEGIN, etc,
|
||
# so we do not complain if name is all caps
|
||
if ( $saw_function_definition{$package}{$subname}
|
||
&& $subname !~ /^[A-Z]+$/ )
|
||
{
|
||
my $lno = $saw_function_definition{$package}{$subname};
|
||
warning(
|
||
"already saw definition of 'sub $subname' in package '$package' at line $lno\n"
|
||
);
|
||
}
|
||
$saw_function_definition{$package}{$subname} =
|
||
$tokenizer_self->[_last_line_number_];
|
||
}
|
||
}
|
||
elsif ( $next_nonblank_token eq ';' ) {
|
||
}
|
||
elsif ( $next_nonblank_token eq '}' ) {
|
||
}
|
||
|
||
# ATTRS - if an attribute list follows, remember the name
|
||
# of the sub so the next opening brace can be labeled.
|
||
# Setting 'statement_type' causes any ':'s to introduce
|
||
# attributes.
|
||
elsif ( $next_nonblank_token eq ':' ) {
|
||
$statement_type = $tok if ( $call_type == SUB_CALL );
|
||
}
|
||
|
||
# if we stopped before an open paren ...
|
||
elsif ( $next_nonblank_token eq '(' ) {
|
||
|
||
# If we DID NOT see this paren above then it must be on the
|
||
# next line so we will set a flag to come back here and see if
|
||
# it is a PROTOTYPE
|
||
|
||
# Otherwise, we assume it is a SIGNATURE rather than a
|
||
# PROTOTYPE and let the normal tokenizer handle it as a list
|
||
if ( !$saw_opening_paren ) {
|
||
$id_scan_state = 'sub'; # we must come back to get proto
|
||
}
|
||
$statement_type = $tok if ( $call_type == SUB_CALL );
|
||
}
|
||
elsif ($next_nonblank_token) { # EOF technically ok
|
||
$subname = "" unless defined($subname);
|
||
warning(
|
||
"expecting ':' or ';' or '{' after definition or declaration of sub '$subname' but saw '$next_nonblank_token'\n"
|
||
);
|
||
}
|
||
check_prototype( $proto, $package, $subname );
|
||
}
|
||
|
||
# no match to either sub name or prototype, but line not blank
|
||
else {
|
||
|
||
}
|
||
return ( $i, $tok, $type, $id_scan_state );
|
||
}
|
||
}
|
||
|
||
#########i###############################################################
|
||
# Tokenizer utility routines which may use CONSTANTS but no other GLOBALS
|
||
#########################################################################
|
||
|
||
sub find_next_nonblank_token {
|
||
my ( $i, $rtokens, $max_token_index ) = @_;
|
||
|
||
if ( $i >= $max_token_index ) {
|
||
if ( !peeked_ahead() ) {
|
||
peeked_ahead(1);
|
||
$rtokens =
|
||
peek_ahead_for_nonblank_token( $rtokens, $max_token_index );
|
||
}
|
||
}
|
||
my $next_nonblank_token = $rtokens->[ ++$i ];
|
||
|
||
if ( $next_nonblank_token =~ /^\s*$/ ) {
|
||
$next_nonblank_token = $rtokens->[ ++$i ];
|
||
}
|
||
return ( $next_nonblank_token, $i );
|
||
}
|
||
|
||
sub numerator_expected {
|
||
|
||
# this is a filter for a possible numerator, in support of guessing
|
||
# for the / pattern delimiter token.
|
||
# returns -
|
||
# 1 - yes
|
||
# 0 - can't tell
|
||
# -1 - no
|
||
# Note: I am using the convention that variables ending in
|
||
# _expected have these 3 possible values.
|
||
my ( $i, $rtokens, $max_token_index ) = @_;
|
||
my $numerator_expected = 0;
|
||
|
||
my $next_token = $rtokens->[ $i + 1 ];
|
||
if ( $next_token eq '=' ) { $i++; } # handle /=
|
||
my ( $next_nonblank_token, $i_next ) =
|
||
find_next_nonblank_token( $i, $rtokens, $max_token_index );
|
||
|
||
if ( $next_nonblank_token =~ /(\(|\$|\w|\.|\@)/ ) {
|
||
$numerator_expected = 1;
|
||
}
|
||
else {
|
||
|
||
if ( $next_nonblank_token =~ /^\s*$/ ) {
|
||
$numerator_expected = 0;
|
||
}
|
||
else {
|
||
$numerator_expected = -1;
|
||
}
|
||
}
|
||
return $numerator_expected;
|
||
}
|
||
|
||
{ ## closure for sub pattern_expected
|
||
my %pattern_test;
|
||
|
||
BEGIN {
|
||
|
||
# List of tokens which may follow a pattern. Note that we will not
|
||
# have formed digraphs at this point, so we will see '&' instead of
|
||
# '&&' and '|' instead of '||'
|
||
|
||
# /(\)|\}|\;|\&\&|\|\||and|or|while|if|unless)/
|
||
my @q = qw( & && | || ? : + - * and or while if unless);
|
||
push @q, ')', '}', ']', '>', ',', ';';
|
||
@{pattern_test}{@q} = (1) x scalar(@q);
|
||
}
|
||
|
||
sub pattern_expected {
|
||
|
||
# This a filter for a possible pattern.
|
||
# It looks at the token after a possible pattern and tries to
|
||
# determine if that token could end a pattern.
|
||
# returns -
|
||
# 1 - yes
|
||
# 0 - can't tell
|
||
# -1 - no
|
||
my ( $i, $rtokens, $max_token_index ) = @_;
|
||
my $is_pattern = 0;
|
||
|
||
my $next_token = $rtokens->[ $i + 1 ];
|
||
if ( $next_token =~ /^[msixpodualgc]/ ) {
|
||
$i++;
|
||
} # skip possible modifier
|
||
my ( $next_nonblank_token, $i_next ) =
|
||
find_next_nonblank_token( $i, $rtokens, $max_token_index );
|
||
|
||
if ( $pattern_test{$next_nonblank_token} ) {
|
||
$is_pattern = 1;
|
||
}
|
||
else {
|
||
|
||
if ( $next_nonblank_token =~ /^\s*$/ ) {
|
||
$is_pattern = 0;
|
||
}
|
||
else {
|
||
$is_pattern = -1;
|
||
}
|
||
}
|
||
return $is_pattern;
|
||
}
|
||
}
|
||
|
||
sub find_next_nonblank_token_on_this_line {
|
||
my ( $i, $rtokens, $max_token_index ) = @_;
|
||
my $next_nonblank_token;
|
||
|
||
if ( $i < $max_token_index ) {
|
||
$next_nonblank_token = $rtokens->[ ++$i ];
|
||
|
||
if ( $next_nonblank_token =~ /^\s*$/ ) {
|
||
|
||
if ( $i < $max_token_index ) {
|
||
$next_nonblank_token = $rtokens->[ ++$i ];
|
||
}
|
||
}
|
||
}
|
||
else {
|
||
$next_nonblank_token = "";
|
||
}
|
||
return ( $next_nonblank_token, $i );
|
||
}
|
||
|
||
sub find_angle_operator_termination {
|
||
|
||
# We are looking at a '<' and want to know if it is an angle operator.
|
||
# We are to return:
|
||
# $i = pretoken index of ending '>' if found, current $i otherwise
|
||
# $type = 'Q' if found, '>' otherwise
|
||
my ( $input_line, $i_beg, $rtoken_map, $expecting, $max_token_index ) = @_;
|
||
my $i = $i_beg;
|
||
my $type = '<';
|
||
pos($input_line) = 1 + $rtoken_map->[$i];
|
||
|
||
my $filter;
|
||
|
||
# we just have to find the next '>' if a term is expected
|
||
if ( $expecting == TERM ) { $filter = '[\>]' }
|
||
|
||
# we have to guess if we don't know what is expected
|
||
elsif ( $expecting == UNKNOWN ) { $filter = '[\>\;\=\#\|\<]' }
|
||
|
||
# shouldn't happen - we shouldn't be here if operator is expected
|
||
else { warning("Program Bug in find_angle_operator_termination\n") }
|
||
|
||
# To illustrate what we might be looking at, in case we are
|
||
# guessing, here are some examples of valid angle operators
|
||
# (or file globs):
|
||
# <tmp_imp/*>
|
||
# <FH>
|
||
# <$fh>
|
||
# <*.c *.h>
|
||
# <_>
|
||
# <jskdfjskdfj* op/* jskdjfjkosvk*> ( glob.t)
|
||
# <${PREFIX}*img*.$IMAGE_TYPE>
|
||
# <img*.$IMAGE_TYPE>
|
||
# <Timg*.$IMAGE_TYPE>
|
||
# <$LATEX2HTMLVERSIONS${dd}html[1-9].[0-9].pl>
|
||
#
|
||
# Here are some examples of lines which do not have angle operators:
|
||
# return unless $self->[2]++ < $#{$self->[1]};
|
||
# < 2 || @$t >
|
||
#
|
||
# the following line from dlister.pl caused trouble:
|
||
# print'~'x79,"\n",$D<1024?"0.$D":$D>>10,"K, $C files\n\n\n";
|
||
#
|
||
# If the '<' starts an angle operator, it must end on this line and
|
||
# it must not have certain characters like ';' and '=' in it. I use
|
||
# this to limit the testing. This filter should be improved if
|
||
# possible.
|
||
|
||
if ( $input_line =~ /($filter)/g ) {
|
||
|
||
if ( $1 eq '>' ) {
|
||
|
||
# We MAY have found an angle operator termination if we get
|
||
# here, but we need to do more to be sure we haven't been
|
||
# fooled.
|
||
my $pos = pos($input_line);
|
||
|
||
my $pos_beg = $rtoken_map->[$i];
|
||
my $str = substr( $input_line, $pos_beg, ( $pos - $pos_beg ) );
|
||
|
||
# Reject if the closing '>' follows a '-' as in:
|
||
# if ( VERSION < 5.009 && $op-> name eq 'assign' ) { }
|
||
if ( $expecting eq UNKNOWN ) {
|
||
my $check = substr( $input_line, $pos - 2, 1 );
|
||
if ( $check eq '-' ) {
|
||
return ( $i, $type );
|
||
}
|
||
}
|
||
|
||
######################################debug#####
|
||
#write_diagnostics( "ANGLE? :$str\n");
|
||
#print "ANGLE: found $1 at pos=$pos str=$str check=$check\n";
|
||
######################################debug#####
|
||
$type = 'Q';
|
||
my $error;
|
||
( $i, $error ) =
|
||
inverse_pretoken_map( $i, $pos, $rtoken_map, $max_token_index );
|
||
|
||
# It may be possible that a quote ends midway in a pretoken.
|
||
# If this happens, it may be necessary to split the pretoken.
|
||
if ($error) {
|
||
warning(
|
||
"Possible tokinization error..please check this line\n");
|
||
report_possible_bug();
|
||
}
|
||
|
||
# count blanks on inside of brackets
|
||
my $blank_count = 0;
|
||
$blank_count++ if ( $str =~ /<\s+/ );
|
||
$blank_count++ if ( $str =~ /\s+>/ );
|
||
|
||
# Now let's see where we stand....
|
||
# OK if math op not possible
|
||
if ( $expecting == TERM ) {
|
||
}
|
||
|
||
# OK if there are no more than 2 non-blank pre-tokens inside
|
||
# (not possible to write 2 token math between < and >)
|
||
# This catches most common cases
|
||
elsif ( $i <= $i_beg + 3 + $blank_count ) {
|
||
|
||
# No longer any need to document this common case
|
||
## write_diagnostics("ANGLE(1 or 2 tokens): $str\n");
|
||
}
|
||
|
||
# OK if there is some kind of identifier inside
|
||
# print $fh <tvg::INPUT>;
|
||
elsif ( $str =~ /^<\s*\$?(\w|::|\s)+\s*>$/ ) {
|
||
write_diagnostics("ANGLE (contains identifier): $str\n");
|
||
}
|
||
|
||
# Not sure..
|
||
else {
|
||
|
||
# Let's try a Brace Test: any braces inside must balance
|
||
my $br = 0;
|
||
while ( $str =~ /\{/g ) { $br++ }
|
||
while ( $str =~ /\}/g ) { $br-- }
|
||
my $sb = 0;
|
||
while ( $str =~ /\[/g ) { $sb++ }
|
||
while ( $str =~ /\]/g ) { $sb-- }
|
||
my $pr = 0;
|
||
while ( $str =~ /\(/g ) { $pr++ }
|
||
while ( $str =~ /\)/g ) { $pr-- }
|
||
|
||
# if braces do not balance - not angle operator
|
||
if ( $br || $sb || $pr ) {
|
||
$i = $i_beg;
|
||
$type = '<';
|
||
write_diagnostics(
|
||
"NOT ANGLE (BRACE={$br ($pr [$sb ):$str\n");
|
||
}
|
||
|
||
# we should keep doing more checks here...to be continued
|
||
# Tentatively accepting this as a valid angle operator.
|
||
# There are lots more things that can be checked.
|
||
else {
|
||
write_diagnostics(
|
||
"ANGLE-Guessing yes: $str expecting=$expecting\n");
|
||
write_logfile_entry("Guessing angle operator here: $str\n");
|
||
}
|
||
}
|
||
}
|
||
|
||
# didn't find ending >
|
||
else {
|
||
if ( $expecting == TERM ) {
|
||
warning("No ending > for angle operator\n");
|
||
}
|
||
}
|
||
}
|
||
return ( $i, $type );
|
||
}
|
||
|
||
sub scan_number_do {
|
||
|
||
# scan a number in any of the formats that Perl accepts
|
||
# Underbars (_) are allowed in decimal numbers.
|
||
# input parameters -
|
||
# $input_line - the string to scan
|
||
# $i - pre_token index to start scanning
|
||
# $rtoken_map - reference to the pre_token map giving starting
|
||
# character position in $input_line of token $i
|
||
# output parameters -
|
||
# $i - last pre_token index of the number just scanned
|
||
# number - the number (characters); or undef if not a number
|
||
|
||
my ( $input_line, $i, $rtoken_map, $input_type, $max_token_index ) = @_;
|
||
my $pos_beg = $rtoken_map->[$i];
|
||
my $pos;
|
||
my $i_begin = $i;
|
||
my $number = undef;
|
||
my $type = $input_type;
|
||
|
||
my $first_char = substr( $input_line, $pos_beg, 1 );
|
||
|
||
# Look for bad starting characters; Shouldn't happen..
|
||
if ( $first_char !~ /[\d\.\+\-Ee]/ ) {
|
||
warning("Program bug - scan_number given character $first_char\n");
|
||
report_definite_bug();
|
||
return ( $i, $type, $number );
|
||
}
|
||
|
||
# handle v-string without leading 'v' character ('Two Dot' rule)
|
||
# (vstring.t)
|
||
# TODO: v-strings may contain underscores
|
||
pos($input_line) = $pos_beg;
|
||
if ( $input_line =~ /\G((\d+)?\.\d+(\.\d+)+)/g ) {
|
||
$pos = pos($input_line);
|
||
my $numc = $pos - $pos_beg;
|
||
$number = substr( $input_line, $pos_beg, $numc );
|
||
$type = 'v';
|
||
report_v_string($number);
|
||
}
|
||
|
||
# handle octal, hex, binary
|
||
if ( !defined($number) ) {
|
||
pos($input_line) = $pos_beg;
|
||
|
||
# Perl 5.22 added floating point literals, like '0x0.b17217f7d1cf78p0'
|
||
# For reference, the format prior to hex floating point is:
|
||
# /\G[+-]?0(([xX][0-9a-fA-F_]+)|([0-7_]+)|([bB][01_]+))/g )
|
||
# (hex) (octal) (binary)
|
||
if (
|
||
$input_line =~
|
||
|
||
/\G[+-]?0( # leading [signed] 0
|
||
|
||
# a hex float, i.e. '0x0.b17217f7d1cf78p0'
|
||
([xX][0-9a-fA-F_]* # X and optional leading digits
|
||
(\.([0-9a-fA-F][0-9a-fA-F_]*)?)? # optional decimal and fraction
|
||
[Pp][+-]?[0-9a-fA-F] # REQUIRED exponent with digit
|
||
[0-9a-fA-F_]*) # optional Additional exponent digits
|
||
|
||
# or hex integer
|
||
|([xX][0-9a-fA-F_]+)
|
||
|
||
# or octal fraction
|
||
|([0-7_]+ # string of octal digits
|
||
(\.([0-7][0-7_]*)?)? # optional decimal and fraction
|
||
[Pp][+-]?[0-7] # REQUIRED exponent, no underscore
|
||
[0-7_]*) # Additonal exponent digits, with underscores
|
||
|
||
# or octal integer
|
||
|([0-7_]+) # string of octal digits
|
||
|
||
# or a binary float
|
||
|([bB][01_]* # 'b' with string of binary digits
|
||
(\.([01][01_]*)?)? # optional decimal and fraction
|
||
[Pp][+-]?[01] # Required exponent indicator, no underscore
|
||
[01_]*) # additional exponent bits
|
||
|
||
# or binary integer
|
||
|([bB][01_]+) # 'b' with string of binary digits
|
||
|
||
)/gx
|
||
)
|
||
{
|
||
$pos = pos($input_line);
|
||
my $numc = $pos - $pos_beg;
|
||
$number = substr( $input_line, $pos_beg, $numc );
|
||
$type = 'n';
|
||
}
|
||
}
|
||
|
||
# handle decimal
|
||
if ( !defined($number) ) {
|
||
pos($input_line) = $pos_beg;
|
||
|
||
if ( $input_line =~ /\G([+-]?[\d_]*(\.[\d_]*)?([Ee][+-]?(\d+))?)/g ) {
|
||
$pos = pos($input_line);
|
||
|
||
# watch out for things like 0..40 which would give 0. by this;
|
||
if ( ( substr( $input_line, $pos - 1, 1 ) eq '.' )
|
||
&& ( substr( $input_line, $pos, 1 ) eq '.' ) )
|
||
{
|
||
$pos--;
|
||
}
|
||
my $numc = $pos - $pos_beg;
|
||
$number = substr( $input_line, $pos_beg, $numc );
|
||
$type = 'n';
|
||
}
|
||
}
|
||
|
||
# filter out non-numbers like e + - . e2 .e3 +e6
|
||
# the rule: at least one digit, and any 'e' must be preceded by a digit
|
||
if (
|
||
$number !~ /\d/ # no digits
|
||
|| ( $number =~ /^(.*)[eE]/
|
||
&& $1 !~ /\d/ ) # or no digits before the 'e'
|
||
)
|
||
{
|
||
$number = undef;
|
||
$type = $input_type;
|
||
return ( $i, $type, $number );
|
||
}
|
||
|
||
# Found a number; now we must convert back from character position
|
||
# to pre_token index. An error here implies user syntax error.
|
||
# An example would be an invalid octal number like '009'.
|
||
my $error;
|
||
( $i, $error ) =
|
||
inverse_pretoken_map( $i, $pos, $rtoken_map, $max_token_index );
|
||
if ($error) { warning("Possibly invalid number\n") }
|
||
|
||
return ( $i, $type, $number );
|
||
}
|
||
|
||
sub inverse_pretoken_map {
|
||
|
||
# Starting with the current pre_token index $i, scan forward until
|
||
# finding the index of the next pre_token whose position is $pos.
|
||
my ( $i, $pos, $rtoken_map, $max_token_index ) = @_;
|
||
my $error = 0;
|
||
|
||
while ( ++$i <= $max_token_index ) {
|
||
|
||
if ( $pos <= $rtoken_map->[$i] ) {
|
||
|
||
# Let the calling routine handle errors in which we do not
|
||
# land on a pre-token boundary. It can happen by running
|
||
# perltidy on some non-perl scripts, for example.
|
||
if ( $pos < $rtoken_map->[$i] ) { $error = 1 }
|
||
$i--;
|
||
last;
|
||
}
|
||
}
|
||
return ( $i, $error );
|
||
}
|
||
|
||
sub find_here_doc {
|
||
|
||
# find the target of a here document, if any
|
||
# input parameters:
|
||
# $i - token index of the second < of <<
|
||
# ($i must be less than the last token index if this is called)
|
||
# output parameters:
|
||
# $found_target = 0 didn't find target; =1 found target
|
||
# HERE_TARGET - the target string (may be empty string)
|
||
# $i - unchanged if not here doc,
|
||
# or index of the last token of the here target
|
||
# $saw_error - flag noting unbalanced quote on here target
|
||
my ( $expecting, $i, $rtokens, $rtoken_map, $max_token_index ) = @_;
|
||
my $ibeg = $i;
|
||
my $found_target = 0;
|
||
my $here_doc_target = '';
|
||
my $here_quote_character = '';
|
||
my $saw_error = 0;
|
||
my ( $next_nonblank_token, $i_next_nonblank, $next_token );
|
||
$next_token = $rtokens->[ $i + 1 ];
|
||
|
||
# perl allows a backslash before the target string (heredoc.t)
|
||
my $backslash = 0;
|
||
if ( $next_token eq '\\' ) {
|
||
$backslash = 1;
|
||
$next_token = $rtokens->[ $i + 2 ];
|
||
}
|
||
|
||
( $next_nonblank_token, $i_next_nonblank ) =
|
||
find_next_nonblank_token_on_this_line( $i, $rtokens, $max_token_index );
|
||
|
||
if ( $next_nonblank_token =~ /[\'\"\`]/ ) {
|
||
|
||
my $in_quote = 1;
|
||
my $quote_depth = 0;
|
||
my $quote_pos = 0;
|
||
my $quoted_string;
|
||
|
||
(
|
||
$i, $in_quote, $here_quote_character, $quote_pos, $quote_depth,
|
||
$quoted_string
|
||
)
|
||
= follow_quoted_string( $i_next_nonblank, $in_quote, $rtokens,
|
||
$here_quote_character, $quote_pos, $quote_depth, $max_token_index );
|
||
|
||
if ($in_quote) { # didn't find end of quote, so no target found
|
||
$i = $ibeg;
|
||
if ( $expecting == TERM ) {
|
||
warning(
|
||
"Did not find here-doc string terminator ($here_quote_character) before end of line \n"
|
||
);
|
||
$saw_error = 1;
|
||
}
|
||
}
|
||
else { # found ending quote
|
||
$found_target = 1;
|
||
|
||
my $tokj;
|
||
foreach my $j ( $i_next_nonblank + 1 .. $i - 1 ) {
|
||
$tokj = $rtokens->[$j];
|
||
|
||
# we have to remove any backslash before the quote character
|
||
# so that the here-doc-target exactly matches this string
|
||
next
|
||
if ( $tokj eq "\\"
|
||
&& $j < $i - 1
|
||
&& $rtokens->[ $j + 1 ] eq $here_quote_character );
|
||
$here_doc_target .= $tokj;
|
||
}
|
||
}
|
||
}
|
||
|
||
elsif ( ( $next_token =~ /^\s*$/ ) and ( $expecting == TERM ) ) {
|
||
$found_target = 1;
|
||
write_logfile_entry(
|
||
"found blank here-target after <<; suggest using \"\"\n");
|
||
$i = $ibeg;
|
||
}
|
||
elsif ( $next_token =~ /^\w/ ) { # simple bareword or integer after <<
|
||
|
||
my $here_doc_expected;
|
||
if ( $expecting == UNKNOWN ) {
|
||
$here_doc_expected = guess_if_here_doc($next_token);
|
||
}
|
||
else {
|
||
$here_doc_expected = 1;
|
||
}
|
||
|
||
if ($here_doc_expected) {
|
||
$found_target = 1;
|
||
$here_doc_target = $next_token;
|
||
$i = $ibeg + 1;
|
||
}
|
||
|
||
}
|
||
else {
|
||
|
||
if ( $expecting == TERM ) {
|
||
$found_target = 1;
|
||
write_logfile_entry("Note: bare here-doc operator <<\n");
|
||
}
|
||
else {
|
||
$i = $ibeg;
|
||
}
|
||
}
|
||
|
||
# patch to neglect any prepended backslash
|
||
if ( $found_target && $backslash ) { $i++ }
|
||
|
||
return ( $found_target, $here_doc_target, $here_quote_character, $i,
|
||
$saw_error );
|
||
}
|
||
|
||
sub do_quote {
|
||
|
||
# follow (or continue following) quoted string(s)
|
||
# $in_quote return code:
|
||
# 0 - ok, found end
|
||
# 1 - still must find end of quote whose target is $quote_character
|
||
# 2 - still looking for end of first of two quotes
|
||
#
|
||
# Returns updated strings:
|
||
# $quoted_string_1 = quoted string seen while in_quote=1
|
||
# $quoted_string_2 = quoted string seen while in_quote=2
|
||
my (
|
||
$i, $in_quote, $quote_character,
|
||
$quote_pos, $quote_depth, $quoted_string_1,
|
||
$quoted_string_2, $rtokens, $rtoken_map,
|
||
$max_token_index
|
||
) = @_;
|
||
|
||
my $in_quote_starting = $in_quote;
|
||
|
||
my $quoted_string;
|
||
if ( $in_quote == 2 ) { # two quotes/quoted_string_1s to follow
|
||
my $ibeg = $i;
|
||
(
|
||
$i, $in_quote, $quote_character, $quote_pos, $quote_depth,
|
||
$quoted_string
|
||
)
|
||
= follow_quoted_string( $i, $in_quote, $rtokens, $quote_character,
|
||
$quote_pos, $quote_depth, $max_token_index );
|
||
$quoted_string_2 .= $quoted_string;
|
||
if ( $in_quote == 1 ) {
|
||
if ( $quote_character =~ /[\{\[\<\(]/ ) { $i++; }
|
||
$quote_character = '';
|
||
}
|
||
else {
|
||
$quoted_string_2 .= "\n";
|
||
}
|
||
}
|
||
|
||
if ( $in_quote == 1 ) { # one (more) quote to follow
|
||
my $ibeg = $i;
|
||
(
|
||
$i, $in_quote, $quote_character, $quote_pos, $quote_depth,
|
||
$quoted_string
|
||
)
|
||
= follow_quoted_string( $ibeg, $in_quote, $rtokens, $quote_character,
|
||
$quote_pos, $quote_depth, $max_token_index );
|
||
$quoted_string_1 .= $quoted_string;
|
||
if ( $in_quote == 1 ) {
|
||
$quoted_string_1 .= "\n";
|
||
}
|
||
}
|
||
return ( $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
|
||
$quoted_string_1, $quoted_string_2 );
|
||
}
|
||
|
||
sub follow_quoted_string {
|
||
|
||
# scan for a specific token, skipping escaped characters
|
||
# if the quote character is blank, use the first non-blank character
|
||
# input parameters:
|
||
# $rtokens = reference to the array of tokens
|
||
# $i = the token index of the first character to search
|
||
# $in_quote = number of quoted strings being followed
|
||
# $beginning_tok = the starting quote character
|
||
# $quote_pos = index to check next for alphanumeric delimiter
|
||
# output parameters:
|
||
# $i = the token index of the ending quote character
|
||
# $in_quote = decremented if found end, unchanged if not
|
||
# $beginning_tok = the starting quote character
|
||
# $quote_pos = index to check next for alphanumeric delimiter
|
||
# $quote_depth = nesting depth, since delimiters '{ ( [ <' can be nested.
|
||
# $quoted_string = the text of the quote (without quotation tokens)
|
||
my ( $i_beg, $in_quote, $rtokens, $beginning_tok, $quote_pos, $quote_depth,
|
||
$max_token_index )
|
||
= @_;
|
||
my ( $tok, $end_tok );
|
||
my $i = $i_beg - 1;
|
||
my $quoted_string = "";
|
||
|
||
0 && do {
|
||
print STDOUT
|
||
"QUOTE entering with quote_pos = $quote_pos i=$i beginning_tok =$beginning_tok\n";
|
||
};
|
||
|
||
# get the corresponding end token
|
||
if ( $beginning_tok !~ /^\s*$/ ) {
|
||
$end_tok = matching_end_token($beginning_tok);
|
||
}
|
||
|
||
# a blank token means we must find and use the first non-blank one
|
||
else {
|
||
my $allow_quote_comments = ( $i < 0 ) ? 1 : 0; # i<0 means we saw a <cr>
|
||
|
||
while ( $i < $max_token_index ) {
|
||
$tok = $rtokens->[ ++$i ];
|
||
|
||
if ( $tok !~ /^\s*$/ ) {
|
||
|
||
if ( ( $tok eq '#' ) && ($allow_quote_comments) ) {
|
||
$i = $max_token_index;
|
||
}
|
||
else {
|
||
|
||
if ( length($tok) > 1 ) {
|
||
if ( $quote_pos <= 0 ) { $quote_pos = 1 }
|
||
$beginning_tok = substr( $tok, $quote_pos - 1, 1 );
|
||
}
|
||
else {
|
||
$beginning_tok = $tok;
|
||
$quote_pos = 0;
|
||
}
|
||
$end_tok = matching_end_token($beginning_tok);
|
||
$quote_depth = 1;
|
||
last;
|
||
}
|
||
}
|
||
else {
|
||
$allow_quote_comments = 1;
|
||
}
|
||
}
|
||
}
|
||
|
||
# There are two different loops which search for the ending quote
|
||
# character. In the rare case of an alphanumeric quote delimiter, we
|
||
# have to look through alphanumeric tokens character-by-character, since
|
||
# the pre-tokenization process combines multiple alphanumeric
|
||
# characters, whereas for a non-alphanumeric delimiter, only tokens of
|
||
# length 1 can match.
|
||
|
||
###################################################################
|
||
# Case 1 (rare): loop for case of alphanumeric quote delimiter..
|
||
# "quote_pos" is the position the current word to begin searching
|
||
###################################################################
|
||
if ( $beginning_tok =~ /\w/ ) {
|
||
|
||
# Note this because it is not recommended practice except
|
||
# for obfuscated perl contests
|
||
if ( $in_quote == 1 ) {
|
||
write_logfile_entry(
|
||
"Note: alphanumeric quote delimiter ($beginning_tok) \n");
|
||
}
|
||
|
||
while ( $i < $max_token_index ) {
|
||
|
||
if ( $quote_pos == 0 || ( $i < 0 ) ) {
|
||
$tok = $rtokens->[ ++$i ];
|
||
|
||
if ( $tok eq '\\' ) {
|
||
|
||
# retain backslash unless it hides the end token
|
||
$quoted_string .= $tok
|
||
unless $rtokens->[ $i + 1 ] eq $end_tok;
|
||
$quote_pos++;
|
||
last if ( $i >= $max_token_index );
|
||
$tok = $rtokens->[ ++$i ];
|
||
}
|
||
}
|
||
my $old_pos = $quote_pos;
|
||
|
||
unless ( defined($tok) && defined($end_tok) && defined($quote_pos) )
|
||
{
|
||
|
||
}
|
||
$quote_pos = 1 + index( $tok, $end_tok, $quote_pos );
|
||
|
||
if ( $quote_pos > 0 ) {
|
||
|
||
$quoted_string .=
|
||
substr( $tok, $old_pos, $quote_pos - $old_pos - 1 );
|
||
|
||
$quote_depth--;
|
||
|
||
if ( $quote_depth == 0 ) {
|
||
$in_quote--;
|
||
last;
|
||
}
|
||
}
|
||
else {
|
||
if ( $old_pos <= length($tok) ) {
|
||
$quoted_string .= substr( $tok, $old_pos );
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
########################################################################
|
||
# Case 2 (normal): loop for case of a non-alphanumeric quote delimiter..
|
||
########################################################################
|
||
else {
|
||
|
||
while ( $i < $max_token_index ) {
|
||
$tok = $rtokens->[ ++$i ];
|
||
|
||
if ( $tok eq $end_tok ) {
|
||
$quote_depth--;
|
||
|
||
if ( $quote_depth == 0 ) {
|
||
$in_quote--;
|
||
last;
|
||
}
|
||
}
|
||
elsif ( $tok eq $beginning_tok ) {
|
||
$quote_depth++;
|
||
}
|
||
elsif ( $tok eq '\\' ) {
|
||
|
||
# retain backslash unless it hides the beginning or end token
|
||
$tok = $rtokens->[ ++$i ];
|
||
$quoted_string .= '\\'
|
||
unless ( $tok eq $end_tok || $tok eq $beginning_tok );
|
||
}
|
||
$quoted_string .= $tok;
|
||
}
|
||
}
|
||
if ( $i > $max_token_index ) { $i = $max_token_index }
|
||
return ( $i, $in_quote, $beginning_tok, $quote_pos, $quote_depth,
|
||
$quoted_string );
|
||
}
|
||
|
||
sub indicate_error {
|
||
my ( $msg, $line_number, $input_line, $pos, $carrat ) = @_;
|
||
interrupt_logfile();
|
||
warning($msg);
|
||
write_error_indicator_pair( $line_number, $input_line, $pos, $carrat );
|
||
resume_logfile();
|
||
return;
|
||
}
|
||
|
||
sub write_error_indicator_pair {
|
||
my ( $line_number, $input_line, $pos, $carrat ) = @_;
|
||
my ( $offset, $numbered_line, $underline ) =
|
||
make_numbered_line( $line_number, $input_line, $pos );
|
||
$underline = write_on_underline( $underline, $pos - $offset, $carrat );
|
||
warning( $numbered_line . "\n" );
|
||
$underline =~ s/\s*$//;
|
||
warning( $underline . "\n" );
|
||
return;
|
||
}
|
||
|
||
sub make_numbered_line {
|
||
|
||
# Given an input line, its line number, and a character position of
|
||
# interest, create a string not longer than 80 characters of the form
|
||
# $lineno: sub_string
|
||
# such that the sub_string of $str contains the position of interest
|
||
#
|
||
# Here is an example of what we want, in this case we add trailing
|
||
# '...' because the line is long.
|
||
#
|
||
# 2: (One of QAML 2.0's authors is a member of the World Wide Web Con ...
|
||
#
|
||
# Here is another example, this time in which we used leading '...'
|
||
# because of excessive length:
|
||
#
|
||
# 2: ... er of the World Wide Web Consortium's
|
||
#
|
||
# input parameters are:
|
||
# $lineno = line number
|
||
# $str = the text of the line
|
||
# $pos = position of interest (the error) : 0 = first character
|
||
#
|
||
# We return :
|
||
# - $offset = an offset which corrects the position in case we only
|
||
# display part of a line, such that $pos-$offset is the effective
|
||
# position from the start of the displayed line.
|
||
# - $numbered_line = the numbered line as above,
|
||
# - $underline = a blank 'underline' which is all spaces with the same
|
||
# number of characters as the numbered line.
|
||
|
||
my ( $lineno, $str, $pos ) = @_;
|
||
my $offset = ( $pos < 60 ) ? 0 : $pos - 40;
|
||
my $excess = length($str) - $offset - 68;
|
||
my $numc = ( $excess > 0 ) ? 68 : undef;
|
||
|
||
if ( defined($numc) ) {
|
||
if ( $offset == 0 ) {
|
||
$str = substr( $str, $offset, $numc - 4 ) . " ...";
|
||
}
|
||
else {
|
||
$str = "... " . substr( $str, $offset + 4, $numc - 4 ) . " ...";
|
||
}
|
||
}
|
||
else {
|
||
|
||
if ( $offset == 0 ) {
|
||
}
|
||
else {
|
||
$str = "... " . substr( $str, $offset + 4 );
|
||
}
|
||
}
|
||
|
||
my $numbered_line = sprintf( "%d: ", $lineno );
|
||
$offset -= length($numbered_line);
|
||
$numbered_line .= $str;
|
||
my $underline = " " x length($numbered_line);
|
||
return ( $offset, $numbered_line, $underline );
|
||
}
|
||
|
||
sub write_on_underline {
|
||
|
||
# The "underline" is a string that shows where an error is; it starts
|
||
# out as a string of blanks with the same length as the numbered line of
|
||
# code above it, and we have to add marking to show where an error is.
|
||
# In the example below, we want to write the string '--^' just below
|
||
# the line of bad code:
|
||
#
|
||
# 2: (One of QAML 2.0's authors is a member of the World Wide Web Con ...
|
||
# ---^
|
||
# We are given the current underline string, plus a position and a
|
||
# string to write on it.
|
||
#
|
||
# In the above example, there will be 2 calls to do this:
|
||
# First call: $pos=19, pos_chr=^
|
||
# Second call: $pos=16, pos_chr=---
|
||
#
|
||
# This is a trivial thing to do with substr, but there is some
|
||
# checking to do.
|
||
|
||
my ( $underline, $pos, $pos_chr ) = @_;
|
||
|
||
# check for error..shouldn't happen
|
||
unless ( ( $pos >= 0 ) && ( $pos <= length($underline) ) ) {
|
||
return $underline;
|
||
}
|
||
my $excess = length($pos_chr) + $pos - length($underline);
|
||
if ( $excess > 0 ) {
|
||
$pos_chr = substr( $pos_chr, 0, length($pos_chr) - $excess );
|
||
}
|
||
substr( $underline, $pos, length($pos_chr) ) = $pos_chr;
|
||
return ($underline);
|
||
}
|
||
|
||
sub pre_tokenize {
|
||
|
||
# Break a string, $str, into a sequence of preliminary tokens. We
|
||
# are interested in these types of tokens:
|
||
# words (type='w'), example: 'max_tokens_wanted'
|
||
# digits (type = 'd'), example: '0755'
|
||
# whitespace (type = 'b'), example: ' '
|
||
# any other single character (i.e. punct; type = the character itself).
|
||
# We cannot do better than this yet because we might be in a quoted
|
||
# string or pattern. Caller sets $max_tokens_wanted to 0 to get all
|
||
# tokens.
|
||
my ( $str, $max_tokens_wanted ) = @_;
|
||
|
||
# we return references to these 3 arrays:
|
||
my @tokens = (); # array of the tokens themselves
|
||
my @token_map = (0); # string position of start of each token
|
||
my @type = (); # 'b'=whitespace, 'd'=digits, 'w'=alpha, or punct
|
||
|
||
do {
|
||
|
||
# whitespace
|
||
if ( $str =~ /\G(\s+)/gc ) { push @type, 'b'; }
|
||
|
||
# numbers
|
||
# note that this must come before words!
|
||
elsif ( $str =~ /\G(\d+)/gc ) { push @type, 'd'; }
|
||
|
||
# words
|
||
elsif ( $str =~ /\G(\w+)/gc ) { push @type, 'w'; }
|
||
|
||
# single-character punctuation
|
||
elsif ( $str =~ /\G(\W)/gc ) { push @type, $1; }
|
||
|
||
# that's all..
|
||
else {
|
||
return ( \@tokens, \@token_map, \@type );
|
||
}
|
||
|
||
push @tokens, $1;
|
||
push @token_map, pos($str);
|
||
|
||
} while ( --$max_tokens_wanted != 0 );
|
||
|
||
return ( \@tokens, \@token_map, \@type );
|
||
}
|
||
|
||
sub show_tokens {
|
||
|
||
# this is an old debug routine
|
||
# not called, but saved for reference
|
||
my ( $rtokens, $rtoken_map ) = @_;
|
||
my $num = scalar( @{$rtokens} );
|
||
|
||
foreach my $i ( 0 .. $num - 1 ) {
|
||
my $len = length( $rtokens->[$i] );
|
||
print STDOUT "$i:$len:$rtoken_map->[$i]:$rtokens->[$i]:\n";
|
||
}
|
||
return;
|
||
}
|
||
|
||
{ ## closure for sub matching end token
|
||
my %matching_end_token;
|
||
|
||
BEGIN {
|
||
%matching_end_token = (
|
||
'{' => '}',
|
||
'(' => ')',
|
||
'[' => ']',
|
||
'<' => '>',
|
||
);
|
||
}
|
||
|
||
sub matching_end_token {
|
||
|
||
# return closing character for a pattern
|
||
my $beginning_token = shift;
|
||
if ( $matching_end_token{$beginning_token} ) {
|
||
return $matching_end_token{$beginning_token};
|
||
}
|
||
return ($beginning_token);
|
||
}
|
||
}
|
||
|
||
sub dump_token_types {
|
||
my ( $class, $fh ) = @_;
|
||
|
||
# This should be the latest list of token types in use
|
||
# adding NEW_TOKENS: add a comment here
|
||
$fh->print(<<'END_OF_LIST');
|
||
|
||
Here is a list of the token types currently used for lines of type 'CODE'.
|
||
For the following tokens, the "type" of a token is just the token itself.
|
||
|
||
.. :: << >> ** && .. || // -> => += -= .= %= &= |= ^= *= <>
|
||
( ) <= >= == =~ !~ != ++ -- /= x=
|
||
... **= <<= >>= &&= ||= //= <=>
|
||
, + - / * | % ! x ~ = \ ? : . < > ^ &
|
||
|
||
The following additional token types are defined:
|
||
|
||
type meaning
|
||
b blank (white space)
|
||
{ indent: opening structural curly brace or square bracket or paren
|
||
(code block, anonymous hash reference, or anonymous array reference)
|
||
} outdent: right structural curly brace or square bracket or paren
|
||
[ left non-structural square bracket (enclosing an array index)
|
||
] right non-structural square bracket
|
||
( left non-structural paren (all but a list right of an =)
|
||
) right non-structural paren
|
||
L left non-structural curly brace (enclosing a key)
|
||
R right non-structural curly brace
|
||
; terminal semicolon
|
||
f indicates a semicolon in a "for" statement
|
||
h here_doc operator <<
|
||
# a comment
|
||
Q indicates a quote or pattern
|
||
q indicates a qw quote block
|
||
k a perl keyword
|
||
C user-defined constant or constant function (with void prototype = ())
|
||
U user-defined function taking parameters
|
||
G user-defined function taking block parameter (like grep/map/eval)
|
||
M (unused, but reserved for subroutine definition name)
|
||
P (unused, but -html uses it to label pod text)
|
||
t type indicater such as %,$,@,*,&,sub
|
||
w bare word (perhaps a subroutine call)
|
||
i identifier of some type (with leading %, $, @, *, &, sub, -> )
|
||
n a number
|
||
v a v-string
|
||
F a file test operator (like -e)
|
||
Y File handle
|
||
Z identifier in indirect object slot: may be file handle, object
|
||
J LABEL: code block label
|
||
j LABEL after next, last, redo, goto
|
||
p unary +
|
||
m unary -
|
||
pp pre-increment operator ++
|
||
mm pre-decrement operator --
|
||
A : used as attribute separator
|
||
|
||
Here are the '_line_type' codes used internally:
|
||
SYSTEM - system-specific code before hash-bang line
|
||
CODE - line of perl code (including comments)
|
||
POD_START - line starting pod, such as '=head'
|
||
POD - pod documentation text
|
||
POD_END - last line of pod section, '=cut'
|
||
HERE - text of here-document
|
||
HERE_END - last line of here-doc (target word)
|
||
FORMAT - format section
|
||
FORMAT_END - last line of format section, '.'
|
||
DATA_START - __DATA__ line
|
||
DATA - unidentified text following __DATA__
|
||
END_START - __END__ line
|
||
END - unidentified text following __END__
|
||
ERROR - we are in big trouble, probably not a perl script
|
||
END_OF_LIST
|
||
|
||
return;
|
||
}
|
||
|
||
BEGIN {
|
||
|
||
# These names are used in error messages
|
||
@opening_brace_names = qw# '{' '[' '(' '?' #;
|
||
@closing_brace_names = qw# '}' ']' ')' ':' #;
|
||
|
||
my @q;
|
||
|
||
my @digraphs = qw(
|
||
.. :: << >> ** && .. || // -> => += -= .= %= &= |= ^= *= <>
|
||
<= >= == =~ !~ != ++ -- /= x= ~~ ~. |. &. ^.
|
||
);
|
||
@is_digraph{@digraphs} = (1) x scalar(@digraphs);
|
||
|
||
my @trigraphs = qw( ... **= <<= >>= &&= ||= //= <=> !~~ &.= |.= ^.= <<~);
|
||
@is_trigraph{@trigraphs} = (1) x scalar(@trigraphs);
|
||
|
||
my @tetragraphs = qw( <<>> );
|
||
@is_tetragraph{@tetragraphs} = (1) x scalar(@tetragraphs);
|
||
|
||
# make a hash of all valid token types for self-checking the tokenizer
|
||
# (adding NEW_TOKENS : select a new character and add to this list)
|
||
my @valid_token_types = qw#
|
||
A b C G L R f h Q k t w i q n p m F pp mm U j J Y Z v
|
||
{ } ( ) [ ] ; + - / * | % ! x ~ = \ ? : . < > ^ &
|
||
#;
|
||
push( @valid_token_types, @digraphs );
|
||
push( @valid_token_types, @trigraphs );
|
||
push( @valid_token_types, @tetragraphs );
|
||
push( @valid_token_types, ( '#', ',', 'CORE::' ) );
|
||
@is_valid_token_type{@valid_token_types} = (1) x scalar(@valid_token_types);
|
||
|
||
# a list of file test letters, as in -e (Table 3-4 of 'camel 3')
|
||
my @file_test_operators =
|
||
qw( A B C M O R S T W X b c d e f g k l o p r s t u w x z);
|
||
@is_file_test_operator{@file_test_operators} =
|
||
(1) x scalar(@file_test_operators);
|
||
|
||
# these functions have prototypes of the form (&), so when they are
|
||
# followed by a block, that block MAY BE followed by an operator.
|
||
# Smartmatch operator ~~ may be followed by anonymous hash or array ref
|
||
@q = qw( do eval );
|
||
@is_block_operator{@q} = (1) x scalar(@q);
|
||
|
||
# these functions allow an identifier in the indirect object slot
|
||
@q = qw( print printf sort exec system say);
|
||
@is_indirect_object_taker{@q} = (1) x scalar(@q);
|
||
|
||
# These tokens may precede a code block
|
||
# patched for SWITCH/CASE/CATCH. Actually these could be removed
|
||
# now and we could let the extended-syntax coding handle them.
|
||
# Added 'default' for Switch::Plain.
|
||
@q =
|
||
qw( BEGIN END CHECK INIT AUTOLOAD DESTROY UNITCHECK continue if elsif else
|
||
unless do while until eval for foreach map grep sort
|
||
switch case given when default catch try finally);
|
||
@is_code_block_token{@q} = (1) x scalar(@q);
|
||
|
||
# I'll build the list of keywords incrementally
|
||
my @Keywords = ();
|
||
|
||
# keywords and tokens after which a value or pattern is expected,
|
||
# but not an operator. In other words, these should consume terms
|
||
# to their right, or at least they are not expected to be followed
|
||
# immediately by operators.
|
||
my @value_requestor = qw(
|
||
AUTOLOAD
|
||
BEGIN
|
||
CHECK
|
||
DESTROY
|
||
END
|
||
EQ
|
||
GE
|
||
GT
|
||
INIT
|
||
LE
|
||
LT
|
||
NE
|
||
UNITCHECK
|
||
abs
|
||
accept
|
||
alarm
|
||
and
|
||
atan2
|
||
bind
|
||
binmode
|
||
bless
|
||
break
|
||
caller
|
||
chdir
|
||
chmod
|
||
chomp
|
||
chop
|
||
chown
|
||
chr
|
||
chroot
|
||
close
|
||
closedir
|
||
cmp
|
||
connect
|
||
continue
|
||
cos
|
||
crypt
|
||
dbmclose
|
||
dbmopen
|
||
defined
|
||
delete
|
||
die
|
||
dump
|
||
each
|
||
else
|
||
elsif
|
||
eof
|
||
eq
|
||
evalbytes
|
||
exec
|
||
exists
|
||
exit
|
||
exp
|
||
fc
|
||
fcntl
|
||
fileno
|
||
flock
|
||
for
|
||
foreach
|
||
formline
|
||
ge
|
||
getc
|
||
getgrgid
|
||
getgrnam
|
||
gethostbyaddr
|
||
gethostbyname
|
||
getnetbyaddr
|
||
getnetbyname
|
||
getpeername
|
||
getpgrp
|
||
getpriority
|
||
getprotobyname
|
||
getprotobynumber
|
||
getpwnam
|
||
getpwuid
|
||
getservbyname
|
||
getservbyport
|
||
getsockname
|
||
getsockopt
|
||
glob
|
||
gmtime
|
||
goto
|
||
grep
|
||
gt
|
||
hex
|
||
if
|
||
index
|
||
int
|
||
ioctl
|
||
join
|
||
keys
|
||
kill
|
||
last
|
||
lc
|
||
lcfirst
|
||
le
|
||
length
|
||
link
|
||
listen
|
||
local
|
||
localtime
|
||
lock
|
||
log
|
||
lstat
|
||
lt
|
||
map
|
||
mkdir
|
||
msgctl
|
||
msgget
|
||
msgrcv
|
||
msgsnd
|
||
my
|
||
ne
|
||
next
|
||
no
|
||
not
|
||
oct
|
||
open
|
||
opendir
|
||
or
|
||
ord
|
||
our
|
||
pack
|
||
pipe
|
||
pop
|
||
pos
|
||
print
|
||
printf
|
||
prototype
|
||
push
|
||
quotemeta
|
||
rand
|
||
read
|
||
readdir
|
||
readlink
|
||
readline
|
||
readpipe
|
||
recv
|
||
redo
|
||
ref
|
||
rename
|
||
require
|
||
reset
|
||
return
|
||
reverse
|
||
rewinddir
|
||
rindex
|
||
rmdir
|
||
scalar
|
||
seek
|
||
seekdir
|
||
select
|
||
semctl
|
||
semget
|
||
semop
|
||
send
|
||
sethostent
|
||
setnetent
|
||
setpgrp
|
||
setpriority
|
||
setprotoent
|
||
setservent
|
||
setsockopt
|
||
shift
|
||
shmctl
|
||
shmget
|
||
shmread
|
||
shmwrite
|
||
shutdown
|
||
sin
|
||
sleep
|
||
socket
|
||
socketpair
|
||
sort
|
||
splice
|
||
split
|
||
sprintf
|
||
sqrt
|
||
srand
|
||
stat
|
||
state
|
||
study
|
||
substr
|
||
symlink
|
||
syscall
|
||
sysopen
|
||
sysread
|
||
sysseek
|
||
system
|
||
syswrite
|
||
tell
|
||
telldir
|
||
tie
|
||
tied
|
||
truncate
|
||
uc
|
||
ucfirst
|
||
umask
|
||
undef
|
||
unless
|
||
unlink
|
||
unpack
|
||
unshift
|
||
untie
|
||
until
|
||
use
|
||
utime
|
||
values
|
||
vec
|
||
waitpid
|
||
warn
|
||
while
|
||
write
|
||
xor
|
||
|
||
switch
|
||
case
|
||
default
|
||
given
|
||
when
|
||
err
|
||
say
|
||
isa
|
||
|
||
catch
|
||
);
|
||
|
||
# patched above for SWITCH/CASE given/when err say
|
||
# 'err' is a fairly safe addition.
|
||
# Added 'default' for Switch::Plain. Note that we could also have
|
||
# a separate set of keywords to include if we see 'use Switch::Plain'
|
||
push( @Keywords, @value_requestor );
|
||
|
||
# These are treated the same but are not keywords:
|
||
my @extra_vr = qw(
|
||
constant
|
||
vars
|
||
);
|
||
push( @value_requestor, @extra_vr );
|
||
|
||
@expecting_term_token{@value_requestor} = (1) x scalar(@value_requestor);
|
||
|
||
# this list contains keywords which do not look for arguments,
|
||
# so that they might be followed by an operator, or at least
|
||
# not a term.
|
||
my @operator_requestor = qw(
|
||
endgrent
|
||
endhostent
|
||
endnetent
|
||
endprotoent
|
||
endpwent
|
||
endservent
|
||
fork
|
||
getgrent
|
||
gethostent
|
||
getlogin
|
||
getnetent
|
||
getppid
|
||
getprotoent
|
||
getpwent
|
||
getservent
|
||
setgrent
|
||
setpwent
|
||
time
|
||
times
|
||
wait
|
||
wantarray
|
||
);
|
||
|
||
push( @Keywords, @operator_requestor );
|
||
|
||
# These are treated the same but are not considered keywords:
|
||
my @extra_or = qw(
|
||
STDERR
|
||
STDIN
|
||
STDOUT
|
||
);
|
||
|
||
push( @operator_requestor, @extra_or );
|
||
|
||
@expecting_operator_token{@operator_requestor} =
|
||
(1) x scalar(@operator_requestor);
|
||
|
||
# these token TYPES expect trailing operator but not a term
|
||
# note: ++ and -- are post-increment and decrement, 'C' = constant
|
||
my @operator_requestor_types = qw( ++ -- C <> q );
|
||
@expecting_operator_types{@operator_requestor_types} =
|
||
(1) x scalar(@operator_requestor_types);
|
||
|
||
# these token TYPES consume values (terms)
|
||
# note: pp and mm are pre-increment and decrement
|
||
# f=semicolon in for, F=file test operator
|
||
my @value_requestor_type = qw#
|
||
L { ( [ ~ !~ =~ ; . .. ... A : && ! || // = + - x
|
||
**= += -= .= /= *= %= x= &= |= ^= <<= >>= &&= ||= //=
|
||
<= >= == != => \ > < % * / ? & | ** <=> ~~ !~~ <<~
|
||
f F pp mm Y p m U J G j >> << ^ t
|
||
~. ^. |. &. ^.= |.= &.=
|
||
#;
|
||
push( @value_requestor_type, ',' )
|
||
; # (perl doesn't like a ',' in a qw block)
|
||
@expecting_term_types{@value_requestor_type} =
|
||
(1) x scalar(@value_requestor_type);
|
||
|
||
# Note: the following valid token types are not assigned here to
|
||
# hashes requesting to be followed by values or terms, but are
|
||
# instead currently hard-coded into sub operator_expected:
|
||
# ) -> :: Q R Z ] b h i k n v w } #
|
||
|
||
# For simple syntax checking, it is nice to have a list of operators which
|
||
# will really be unhappy if not followed by a term. This includes most
|
||
# of the above...
|
||
%really_want_term = %expecting_term_types;
|
||
|
||
# with these exceptions...
|
||
delete $really_want_term{'U'}; # user sub, depends on prototype
|
||
delete $really_want_term{'F'}; # file test works on $_ if no following term
|
||
delete $really_want_term{'Y'}; # indirect object, too risky to check syntax;
|
||
# let perl do it
|
||
|
||
@q = qw(q qq qw qx qr s y tr m);
|
||
@is_q_qq_qw_qx_qr_s_y_tr_m{@q} = (1) x scalar(@q);
|
||
|
||
@q = qw(package);
|
||
@is_package{@q} = (1) x scalar(@q);
|
||
|
||
@q = qw( ? : );
|
||
push @q, ',';
|
||
@is_comma_question_colon{@q} = (1) x scalar(@q);
|
||
|
||
# Hash of other possible line endings which may occur.
|
||
# Keep these coordinated with the regex where this is used.
|
||
# Note: chr(13) = chr(015)="\r".
|
||
@q = ( chr(13), chr(29), chr(26) );
|
||
@other_line_endings{@q} = (1) x scalar(@q);
|
||
|
||
# These keywords are handled specially in the tokenizer code:
|
||
my @special_keywords = qw(
|
||
do
|
||
eval
|
||
format
|
||
m
|
||
package
|
||
q
|
||
qq
|
||
qr
|
||
qw
|
||
qx
|
||
s
|
||
sub
|
||
tr
|
||
y
|
||
);
|
||
push( @Keywords, @special_keywords );
|
||
|
||
# Keywords after which list formatting may be used
|
||
# WARNING: do not include |map|grep|eval or perl may die on
|
||
# syntax errors (map1.t).
|
||
my @keyword_taking_list = qw(
|
||
and
|
||
chmod
|
||
chomp
|
||
chop
|
||
chown
|
||
dbmopen
|
||
die
|
||
elsif
|
||
exec
|
||
fcntl
|
||
for
|
||
foreach
|
||
formline
|
||
getsockopt
|
||
if
|
||
index
|
||
ioctl
|
||
join
|
||
kill
|
||
local
|
||
msgctl
|
||
msgrcv
|
||
msgsnd
|
||
my
|
||
open
|
||
or
|
||
our
|
||
pack
|
||
print
|
||
printf
|
||
push
|
||
read
|
||
readpipe
|
||
recv
|
||
return
|
||
reverse
|
||
rindex
|
||
seek
|
||
select
|
||
semctl
|
||
semget
|
||
send
|
||
setpriority
|
||
setsockopt
|
||
shmctl
|
||
shmget
|
||
shmread
|
||
shmwrite
|
||
socket
|
||
socketpair
|
||
sort
|
||
splice
|
||
split
|
||
sprintf
|
||
state
|
||
substr
|
||
syscall
|
||
sysopen
|
||
sysread
|
||
sysseek
|
||
system
|
||
syswrite
|
||
tie
|
||
unless
|
||
unlink
|
||
unpack
|
||
unshift
|
||
until
|
||
vec
|
||
warn
|
||
while
|
||
given
|
||
when
|
||
);
|
||
@is_keyword_taking_list{@keyword_taking_list} =
|
||
(1) x scalar(@keyword_taking_list);
|
||
|
||
# perl functions which may be unary operators.
|
||
|
||
# This list is used to decide if a pattern delimited by slashes, /pattern/,
|
||
# can follow one of these keywords.
|
||
@q = qw(
|
||
chomp eof eval fc lc pop shift uc undef
|
||
);
|
||
|
||
@is_keyword_rejecting_slash_as_pattern_delimiter{@q} =
|
||
(1) x scalar(@q);
|
||
|
||
# These are keywords for which an arg may optionally be omitted. They are
|
||
# currently only used to disambiguate a ? used as a ternary from one used
|
||
# as a (depricated) pattern delimiter. In the future, they might be used
|
||
# to give a warning about ambiguous syntax before a /.
|
||
# Note: split has been omitted (see not below).
|
||
my @keywords_taking_optional_arg = qw(
|
||
abs
|
||
alarm
|
||
caller
|
||
chdir
|
||
chomp
|
||
chop
|
||
chr
|
||
chroot
|
||
close
|
||
cos
|
||
defined
|
||
die
|
||
eof
|
||
eval
|
||
evalbytes
|
||
exit
|
||
exp
|
||
fc
|
||
getc
|
||
glob
|
||
gmtime
|
||
hex
|
||
int
|
||
last
|
||
lc
|
||
lcfirst
|
||
length
|
||
localtime
|
||
log
|
||
lstat
|
||
mkdir
|
||
next
|
||
oct
|
||
ord
|
||
pop
|
||
pos
|
||
print
|
||
printf
|
||
prototype
|
||
quotemeta
|
||
rand
|
||
readline
|
||
readlink
|
||
readpipe
|
||
redo
|
||
ref
|
||
require
|
||
reset
|
||
reverse
|
||
rmdir
|
||
say
|
||
select
|
||
shift
|
||
sin
|
||
sleep
|
||
sqrt
|
||
srand
|
||
stat
|
||
study
|
||
tell
|
||
uc
|
||
ucfirst
|
||
umask
|
||
undef
|
||
unlink
|
||
warn
|
||
write
|
||
);
|
||
@is_keyword_taking_optional_arg{@keywords_taking_optional_arg} =
|
||
(1) x scalar(@keywords_taking_optional_arg);
|
||
|
||
# This list is used to decide if a pattern delmited by question marks,
|
||
# ?pattern?, can follow one of these keywords. Note that from perl 5.22
|
||
# on, a ?pattern? is not recognized, so we can be much more strict than
|
||
# with a /pattern/. Note that 'split' is not in this list. In current
|
||
# versions of perl a question following split must be a ternary, but
|
||
# in older versions it could be a pattern. The guessing algorithm will
|
||
# decide. We are combining two lists here to simplify the test.
|
||
@q = ( @keywords_taking_optional_arg, @operator_requestor );
|
||
@is_keyword_rejecting_question_as_pattern_delimiter{@q} =
|
||
(1) x scalar(@q);
|
||
|
||
# These are not used in any way yet
|
||
# my @unused_keywords = qw(
|
||
# __FILE__
|
||
# __LINE__
|
||
# __PACKAGE__
|
||
# );
|
||
|
||
# The list of keywords was originally extracted from function 'keyword' in
|
||
# perl file toke.c version 5.005.03, using this utility, plus a
|
||
# little editing: (file getkwd.pl):
|
||
# while (<>) { while (/\"(.*)\"/g) { print "$1\n"; } }
|
||
# Add 'get' prefix where necessary, then split into the above lists.
|
||
# This list should be updated as necessary.
|
||
# The list should not contain these special variables:
|
||
# ARGV DATA ENV SIG STDERR STDIN STDOUT
|
||
# __DATA__ __END__
|
||
|
||
@is_keyword{@Keywords} = (1) x scalar(@Keywords);
|
||
}
|
||
1;
|