#!/usr/bin/perl -w
#
# Fsdb::IO.pm
# Copyright (C) 2005-2019 by John Heidemann <johnh@isi.edu>
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License,
# version 2, as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
#
package Fsdb::IO;
=head1 NAME
Fsdb::IO - base class for Fsdb IO (FsdbReader and FsdbWriter)
=head1 EXAMPLES
There are several ways to do IO. We look at several that compute
the product of x and y for this input:
#fsdb x y product
1 10 -
2 20 -
The following routes go from most easy-to-use to least,
and also from least efficient to most.
For IO-intensive work, if fastpath takes 1 unit of time,
then using hashes or arrays takes approximately 2 units of time,
all due to CPU overhead.
=head2 Using A Hash
use Fsdb::IO::Reader;
use Fsdb::IO::Writer;
# preamble
my $out;
my $in = new Fsdb::IO::Reader(-file => '-', -comment_handler => \$out)
or die "cannot open stdin as fsdb\n";
$out = new Fsdb::IO::Writer(-file => '-', -clone => $in)
or die "cannot open stdin as fsdb\n";
# core starts here
my %hrow;
while ($in->read_row_to_href(\%hrow)) {
$hrow{product} = $hrow{x} * $hrow{y};
$out->write_row_from_href(\%hrow);
};
It can be convenient to use a hash because one can easily extract
fields using hash keys, but hashes can be slow.
=head2 Arrays Instead of Hashes
We can add a bit to end of the preamble:
my $x_i = $in->col_to_i('x') // die "no x column.\n";
my $y_i = $in->col_to_i('y') // die "no y column.\n";
my $product_i = $in->col_to_i('product') // die "no product column.\n";
And then replace the core with arrays:
my @arow;
while ($in->read_row_to_aref(\@arow)) {
$arow[$product_i] = $arow[$x_i] * $arow[$y_i];
$out->write_row_from_aref(\@arow);
};
This code has two advantages over hrefs:
First, there is explicit error checking for presence of
the expected fields.
Second, arrays are likely a bit faster than hashes.
=head2 Objects Instead of Arrays
Keeping the same preamble as for arrays,
we can directly get internal Fsdb "row objects"
with a new core:
# core
my $rowobj;
while ($rowobj = $in->read_rowobj) {
if (!ref($rowobj)) {
# comment
&{$in->{_comment_sub}}($rowobj);
next;
};
$rowobj->[$product_i] = $rowobj->[$x_i] * $rowobj->[$y_i];
$out->write_rowobj($rowobj);
};
This code is a bit faster because we just return the internal
representation (a rowobj),
rather than copy into an array.
However, unfortunately it doesn't handle comment processing.
=head2 Fastpathing
To go really fast, we can build a custom thunk
(a chunk of code) that does exactly what we want.
This approach is called a "fastpath".
It requires a bit more in the preamble (building on the array version):
my $in_fastpath_sub = $in->fastpath_sub();
my $out_fastpath_sub = $out->fastpath_sub();
And it allows a shorter core (modeled on rowobjs),
since the fastpath includes comment processing:
my $rowobj;
while ($rowobj = &$in_fastpath_sub) {
$rowobj->[$product_i] = $rowobj->[$x_i] * $rowobj->[$y_i];
&$out_fastpath_sub($rowobj);
};
This code is the fastest way to implement this block
without evaling code.
=head1 FUNCTIONS
=cut
@ISA = ();
$VERSION = 2.0;
use strict;
use IO::File;
use Carp;
=head2 new
$fsdb = new Fsdb::IO;
Creates a new IO object. Usually you should not create a FsdbIO object
directly, but instead create a C<FsdbReader> or C<FsdbWriter>.
Options:
=over 4
=item -fh FILE_HANDLE
Write IO to the given file handle.
=item -header HEADER_LINE
Force the header to the given HEADER_LINE
(should be verbatim, including #h or whatever).
=back
=item -fscode CODE
Define just the column (or field) separator fscode part of the header.
See L<dbfilealter> for a list of valid field separators.
=item -rscode CODE
Define just the row separator part of the header.
See L<dbfilealter> for a list of valid row separators.
=item -cols CODE
Define just the columns of the header.
=item -compression CODE
Define the compression mode for the file
that will take effect after the header.
=item -clone $fsdb
Copy the stream's configuration from $FSDB, another Fsdb::IO object.
=back
=cut
sub new {
my $class = shift @_;
my $self = bless {
# i/o source: one of:
_fh => undef, # filehandle to file
_encoding => undef, # encoding (defaults to :utf8)
_compression => undef,
_queue => undef,# ref to queue
_headerrow => undef,
_header_set => undef,
_header_prequel => undef,
# _attributes => {}, # arbitrary attributes for the file
# _attributes_set => undef,
# field (i.e., column) separator
_fscode => 'D', # -C option code, (D=default)
_fs => ' ', # field separator
_fsre => '\s+', # field separator
# row separators
_rscode => 'D', # -R (D=default, can be omitted; or R=rowized)
_empty => '-',
_cols => [], # array of names of the columns (fields)
_cols_to_i => {}, # reverse hash mapping names to offsets
_types => [], # array of types of the columns
_typespecs => [], # array of types of the columns as specified (could be null)
_fastpath_active => undef, # track fastpathing to avoid breaking it
_codifier_sub => undef, # converting perl code with embedded column names
_error => undef, # error status (should NEVER end in a newline)
}, $class;
return $self;
}
=head2 _reset_cols
$fsdb->_reset_cols
Internal: zero all the mappings in the curren schema.
=cut
sub _reset_cols {
my($self) = @_;
croak "Fsdb::IO::_reset_cols: attempted after _header_set\n"
if ($self->{_header_set});
$self->{_cols} = [];
$self->{_cols_to_i} = {};
$self->{_types} = [];
$self->{_typespecs} = [];
$self->{_headerrow} = undef;
$self->{_debug} = undef;
}
=head2 _find_filename_decompressor
returns the name of the decompression program for FILE if it ends in a compression
extension
=cut
sub _find_filename_decompressor($;$) {
my($file, $as_extension) = @_;
return undef if (!defined($file));
my($ext) = ($file =~ m/\.(gz|xz|bz2)$/);
$ext = $file if ($as_extension && !defined($ext));
return undef if (!defined($ext));
return "zcat" if ($ext eq "gz");
return "xzcat" if ($ext eq "xz");
return "bzcat" if ($ext eq "bz2");
return undef;
}
=head2 config_one
$fsdb->config_one($arglist_aref);
Parse the first configuration option on the list, removing it.
Options are listed in new.
=cut
sub config_one {
my($self, $aaref) = @_;
if ($aaref->[0] eq '-fh') {
shift @$aaref;
$self->{_fh} = shift @$aaref;
# should probably check ref to confirm IO::Handle
} elsif ($aaref->[0] eq '-queue') {
shift @$aaref;
$self->{_queue} = shift @$aaref;
croak "bad -queue argument" if (ref($self->{_queue}) !~ /^Fsdb::BoundedQueue/);
} elsif ($aaref->[0] eq '-header') {
shift @$aaref;
$self->{_headerrow} = shift @$aaref;
$self->parse_headerrow; # fill in col mappings, etc.
} elsif ($aaref->[0] eq '-fscode' || $aaref->[0] eq '-F') {
shift @$aaref;
my $code = shift @$aaref;
$self->parse_fscode($code);
$self->update_headerrow;
} elsif ($aaref->[0] eq '-rscode' || $aaref->[0] eq '-C') {
shift @$aaref;
my $code = shift @$aaref;
$self->parse_rscode($code);
$self->update_headerrow;
} elsif ($aaref->[0] eq '-cols') {
shift @$aaref;
my $col_ref = $aaref->[0]; shift @$aaref;
$self->_reset_cols;
foreach (@$col_ref) {
$self->_internal_col_create($_);
};
$self->update_headerrow;
} elsif ($aaref->[0] eq '-clone') {
shift @$aaref;
my($clone) = shift @$aaref;
$self->_reset_cols;
$self->parse_fscode($clone->fscode());
$self->parse_rscode($clone->rscode());
foreach my $cs ($clone->colspecs()) {
$self->_internal_col_create($cs);
};
$self->{_encoding} = $clone->{_encoding};
$self->{_compression} = $clone->{_compression};
$self->update_headerrow;
} elsif ($aaref->[0] eq '-encoding') {
shift @$aaref;
$self->{_encoding} = shift @$aaref;
} elsif ($aaref->[0] eq '-compression') {
shift @$aaref;
$self->{_compression} = shift @$aaref;
$self->{_compression} = undef if ($self->{_compression} && $self->{_compression} eq 'none');
my $decompressor = _find_filename_decompressor($self->{_compression}, 1);
$self->{_error} = "bad compression mode: " . $self->{_compression}
if ($self->{_compression} && !defined($decompressor));
$self->update_headerrow;
} elsif ($aaref->[0] eq '-debug') {
shift @$aaref;
$self->{_debug} = shift @$aaref;
} else {
croak("unknown option: " . $aaref->[0]);
};
}
=head2 config
$fsdb->config(-arg1 => $value1, -arg2 => $value2);
Parse all options in the list.
=cut
sub config ($@) {
my($self) = shift @_;
my(@args) = @_;
while ($#args >= 0) {
$self->config_one(\@args);
};
}
=head2 default_binmode
$fsdb->default_binmode();
Set the file to the correct binmode,
either given by C<-encoding> at setup,
or defaulting from C<LC_CTYPE> or C<LANG>.
If the file is compressed, we will reset binmode after reading the header.
=cut
sub default_binmode($) {
my($self) = shift @_;
if (!defined($self->{_encoding})) {
# foreach ($ENV{LC_CTYPE}, $ENV{LANG}, 'en.:utf8') {
# as of perl v5.16.3, UTF-8 segfaults
foreach ('en.:utf8') {
next if (!defined($_));
my($locale, $charset) = ($_ =~ /^([^\.]+)\.([^\.]+)/);
next if (!defined($charset));
$self->{_encoding} = $charset;
last;
};
};
my $mode = $self->{_encoding};
$mode = ":encoding($mode)" if ($mode !~ /^:/);
return $mode;
}
=head2 compare
$result = $fsdb->compare($other_fsdb)
Compares two Fsdb::IO objects, returning the strings
"identical" (same field separator, columns, and column order),
or maybe "compatible" (same field separator but different columns), or
undef if they differ.
=cut
sub compare ($$) {
my($self, $other) = @_;
return undef if ($self->{_error} || $other->{_error});
return undef if ($self->{_fscode} ne $other->{_fscode});
my @self_cols = @{$self->{_cols}};
my @other_cols = @{$other->{_cols}};
return "compatible" if ($#self_cols != $#other_cols);
foreach (0..$#self_cols) {
return "compatible" if ($self_cols[$_] ne $other_cols[$_]);
};
return 'identical';
}
=head2 close
$fsdb->close;
Closes the file, frees open file handle, or sends an EOF signal
(and undef) down the open queue.
=cut
sub close {
my($self) = @_;
return if ($self->{_error});
if (defined($self->{_fh})) {
$self->{_fh}->close;
delete $self->{_fh}; # help garbage collect auto-generated Symbols from IO::Handle
};
if (defined($self->{_queue})) {
$self->{_queue}->enqueue(undef);
delete $self->{_queue};
};
# reap any subprocesses
if (defined($self->{_hdfs_reader_pid})) {
waitpid $self->{_hdfs_reader_pid}, 0;
};
if (defined($self->{_compression_pid})) {
waitpid $self->{_compression_pid}, 0;
};
$self->{_error} = 'closed';
}
=head2 error
$fsdb->error;
Returns a descriptive string if there is an error,
or undef if not.
The string will never end in a newline or punctuation.
=cut
sub error {
my($self) = @_;
return $self->{_error};
}
=head2 update_v1_headerrow
internal: create the header the internal schema
=cut
sub update_v1_headerrow {
my $self = shift @_;
my $h = "#h ";
$h = "#L " if ($self->{_rscode} ne 'D');
if ($self->{_fscode} && $self->{_fscode} ne 'D') {
$h .= "-F" . $self->{_fscode} . " ";
};
if ($self->{_rscode} && $self->{_rscode} eq 'I') { # xxx: should be ne 'D'
$h .= "-R" . $self->{_rscode} . " ";
};
$h .= join(" ", @{$self->{_cols}});
$self->{_headerrow} = $h;
}
=head2 parse_v1_headerrow
internal: interpet the header
=cut
sub parse_v1_headerrow ($) {
my($self) = @_;
return if ($self->{_error});
my(@f) = split(/\s+/, $self->{_headerrow});
my $tag = shift @f;
if ($tag eq '#L') {
$self->{_rscode} = 'C';
} elsif ($tag ne "#h") {
$self->{_error} = "header line is not fsdb format";
return;
};
#
# handle options
#
while ($#f >= 0 && $f[0] =~ /^-(.)(.*)/) {
my($key, $value) = ($1, $2);
shift @f;
if ($key eq 'F') {
$self->parse_v1_fscode($value);
}
};
# create them!
foreach (@f) {
$self->_internal_col_create($_);
};
}
=head2 update_headerrow
internal: create the header the internal schema
=cut
sub update_headerrow($) {
my $self = shift @_;
my $h = "#fsdb ";
if ($self->{_fscode} && $self->{_fscode} ne 'D') {
$h .= "-F " . $self->{_fscode} . " ";
};
if ($self->{_rscode} && $self->{_rscode} ne 'D') { # xxx: should be ne 'D'
$h .= "-R " . $self->{_rscode} . " ";
};
if ($self->{_compression} && $self->{_compression} ne 'none') { # xxx: should be ne 'D'
$h .= "-Z " . $self->{_compression} . " ";
};
$self->{_header_prequel} = $h; # save this aside for dbcolneaten
$h .= join(" ", $self->colspecs());
$self->{_headerrow} = $h;
}
=head2 parse_headerrow
internal: interpet the v2 header.
Format is:
#fsdb [-F x] [-R x] [-Z x] columns
All options must come first, start with dashes, and have an argument.
(More regular than the v1 header.)
Columns have optional :t type specifiers.
=cut
sub parse_headerrow($) {
my($self) = @_;
return if ($self->{_error});
my(@f) = split(/\s+/, $self->{_headerrow});
my $tag = shift @f;
if ($tag eq '#fsdb') {
# fall through
} elsif ($tag eq '#L' || $tag eq '#h') {
return $self->parse_v1_headerrow;
} else {
$self->{_error} = "header line is not fsdb format";
return;
};
#
# handle options
#
while ($#f >= 0 && $f[0] =~ /^-/) {
my($key) = shift @f;
my($value) = shift @f;
if ($key eq '-F') {
$self->parse_fscode($value);
} elsif ($key eq '-R') {
$self->parse_rscode($value);
} elsif ($key eq '-Z') {
$self->parse_compression($value);
} else {
$self->{_error} = "header has unknown option " . $key;
return;
};
};
# create them!
foreach (@f) {
$self->_internal_col_create($_);
};
}
=head2 parse_v1_fscode
internal
=cut
sub parse_v1_fscode {
my $self = shift @_;
my $code = shift @_;
if ($code =~ /^[DsSt]$/) {
$self->parse_fscode($code);
} else {
# Ick. Old way. Not very safe.
# Take char itself as code.
$self->parse_fscode("C$code");
};
}
=head2 parse_fscode
Parse the field separator.
See L<dbfilealter> for a list of valid values.
=cut
sub parse_fscode {
my $self = shift @_;
my $code = shift @_;
my ($fsre, $outfs);
if (!defined($code) || $code eq 'D') { # default
$fsre = '\s+'; # "[ \t\n]+";
$outfs = "\t";
$code = 'D'; # always leave it defined so eq/ne work
} elsif ($code eq 's') { # single space
$fsre = '\s+';
$outfs = " ";
} elsif ($code eq 'S') { # double space
$fsre = '\s\s+';
$outfs = " ";
} elsif ($code eq 't') { # single tab
$fsre = "\t";
$outfs = "\t";
} elsif ($code =~ /^X(.*)$/) { # hex value
my $real_code = chr(hex($1));
$fsre = "[$real_code]+";
$outfs = $real_code;
} elsif ($code =~ /^C(.)$/) { # character value
my $real_code = $1;
$fsre = "[$real_code]+";
$outfs = $real_code;
} else {
$self->{_error} = "bad field separator given ($code)";
return;
};
$self->{_fscode} = $code;
$self->{_fsre} = $fsre;
$self->{_fs} = $outfs;
}
=head2 parse_rscode
Internal: Interpret rscodes.
See L<dbfilealter> for a list of valid values.
=cut
sub parse_rscode($$) {
my($self, $code) = @_;
$code = 'D' if (!defined($code));
$self->{_error} = "invalid rscode: $code"
if (!($code eq 'D' || $code eq 'C' || $code eq 'I'));
$self->{_rscode} = $code;
}
=head2 parse_compression
Internal: Interpret compression.
See L<dbfilealter> for a list of valid values.
=cut
sub parse_compression($$) {
my($self, $code) = @_;
$code = 'none' if (!defined($code));
$self->{_error} = "invalid compression: $code"
if (!($code eq 'none' || $code eq 'gz' || $code eq 'xz' || $code eq 'bz2'));
$self->{_compression} = $code;
}
=head2 establish_new_col_mapping
internal
=cut
sub establish_new_col_mapping {
my($self, $colspec) = @_;
my($colname, $type, $typespec) = $self->colspec_to_name_type_spec($colspec);
my $coli = $#{$self->{_cols}} + 1;
$self->{_cols}->[$coli] = $colname;
$self->{_cols_to_i}->{$colname} = $coli;
# Old.pm also registers _$colname, but that seems Wrong.
$self->{_cols_to_i}->{"$coli"} = $coli; # numeric synonym
$self->{_types}->[$coli] = $type;
$self->{_typespecs}->[$coli] = $typespec;
$self->{_codifier_sub} = undef; # clear cache
}
=head2 col_create
$fsdb->col_create($col_name)
Add a new column named $COL_NAME to the schema.
Returns undef on failure, or 1 if sucessful.
(Note: does I<not> return the column index on creation because
so that C<or> can be used for error checking,
given that the column number could be zero.)
Also, update the header row to reflect this column
(compare to C<_internal_col_create>).
=cut
sub col_create {
my $self = shift @_;
$self->_internal_col_create(@_) and
$self->update_headerrow;
}
=head2 colspec_to_name_type_spec
($name, $type, $type_speced) = $fsdb->colspec_to_name_type($colspec)
Split a colspec into a name, type, and the type as specified
(which may be null if no type was given).
=cut
sub colspec_to_name_type_spec($$) {
my($self, $colspec) = @_;
my($name, $type) = split(/:/, $colspec);
return($name, $type // 'a', $type);
};
=head2 _internal_col_create
$fsdb->_internal_col_create($colspec)
For internal C<Fsdb::IO> use only.
Create a new column $COL_NAME,
just like C<col_create>,
but do I<not> update the header row
(as that function does).
=cut
sub _internal_col_create {
my($self, $colspec) = @_;
my($colname, $type, $typespec) = $self->colspec_to_name_type_spec($colspec);
if ($self->{_header_set}) {
$self->{_error} = "attempt to add column to frozen fsdb handle (reader or writer that's been written to): $colname";
return undef;
};
if (defined($self->col_to_i($colname))) {
$self->{_error} = "duplicate col definition: $colname";
return undef;
};
$self->establish_new_col_mapping($colspec);
return 1;
}
=head2 field_contains_fs
$boolean = $fsdb->field_contains_fs($field);
Determine if the $FIELD contains $FSDB's fscode
(in which case it is malformed).
=cut
sub field_contains_fs {
my($self, $field) = @_;
return ($field =~ /$self->{_fsre}/);
}
=head2 fref_contains_fs
$boolean = $fsdb->fref_contains_fs($fref);
Determine if any field in $FREF contains $FSDB's fscode
(in which case it is malformed).
=cut
sub fref_contains_fs {
my($self, $fref) = @_;
foreach (@$fref) {
return 1 if ($_ =~ /$self->{_fsre}/);
};
return 0;
}
=head2 correct_fref_containing_fs
$boolean = $fsdb->correct_fref_containing_fs($fref);
Patch up any field in $FREF contains $FSDB's fscode, as best as possible,
but turning the field separator into underscores.
Updates $FREF in place, and returns if it was altered.
This function looses data.
=cut
sub correct_fref_containing_fs {
my($self, $fref) = @_;
my $changed = undef;
foreach (0..$#$fref) {
$changed = 1 if ($fref->[$_] =~ s/$self->{_fsre}/_/g);
};
return $changed;
}
=head2 fscode
$fscode = $fsdb->fscode;
Returns the fscode of the given database.
(The encoded verison representing the field separator.)
See also fs to get the actual field separator.
=cut
sub fscode {
my($self) = @_;
return $self->{_fscode};
}
=head2 fs
$fscode = $fsdb->fs;
Returns the field separator.
See C<fscode> to get the "encoded" version.
=cut
sub fs {
my($self) = @_;
return $self->{_fs};
}
=head2 rscode
$rscode = $fsdb->rscode;
Returns the rscode of the given database.
=cut
sub rscode {
my($self) = @_;
return $self->{_rscode};
}
=head2 ncols
@fields = $fsdb->ncols;
Return the number of columns.
=cut
sub ncols {
my($self) = @_;
return $#{$self->{_cols}} + 1;
}
=head2 cols
$fields_aref = $fsdb->cols;
Returns the column names (the field names, without type specifications)
of the open database
as an aref.
=cut
sub cols($) {
my($self) = @_;
return $self->{_cols};
}
=head2 colspecs
$fields_aref = $fsdb->colspecs();
Returns the column headings (the field names) of the open database
as an aref.
=cut
sub colspecs($) {
my($self) = @_;
my(@cs) = ();
foreach (0..$#{$self->{_cols}}) {
push(@cs, $self->col_to_colspec($_));
};
return(@cs);
}
=head2 col_to_i
$coli = $fsdb->col_to_i($column_name);
Returns the column index (0-based) of a given $COLUMN_NAME.
(Names cannot have types with them.)
Note: tests for existence of columns must use C<defined>,
since the index can be 0 which would be interpreted as false.
=cut
sub col_to_i {
my($self, $n) = @_;
return $self->{_cols_to_i}->{$n};
}
=head2 colspec_to_i
$coli = $fsdb->colspec_to_i($column_specification);
Returns the column index (0-based) of a given $COLUMN_NAME.
Name may or may not include a type.
Note: tests for existence of columns must use C<defined>,
since the index can be 0 which would be interpreted as false.
=cut
sub colspec_to_i {
my($self, $cs) = @_;
my($n) = $self->colspec_to_name_type_spec($cs);
return $self->{_cols_to_i}->{$n};
}
=head2 col_to_name
@fields = $fsdb->col_to_name($column_name);
Returns the column anme a given $COLUMN_NAME_OR_INDEX.
=cut
sub col_to_name {
my($self, $n_or_i) = @_;
return $self->{_cols}->[$self->{_cols_to_i}->{$n_or_i}];
}
=head2 col_to_type
@fields = $fsdb->col_to_type($column_name, $force_type);
Returns the column type (and undef if type is not required, unless $FORCE_TYPE)
of a given $COLUMN_NAME.
=cut
sub col_to_type($$;$) {
my($self, $n, $force_type) = @_;
my($i) = $self->{_cols_to_i}->{$n};
return $force_type ? $self->{_type}->[$i] : $self->{_typespecs}->[$i];
}
=head2 col_to_colspec
@fields = $fsdb->col_to_colspec($column_name, $force_type);
Returns the column specification (type is optional, unless $FORCE_TYPE) of a given $COLUMN_NAME.
=cut
sub col_to_colspec($$;$) {
my($self, $n, $force_type) = @_;
my($i) = $self->{_cols_to_i}->{$n};
my($cs) = $self->{_cols}->[$i];
my $type = $force_type ? $self->{_type}->[$i] : $self->{_typespecs}->[$i];
return defined($type) ? "$cs:$type" : $cs;
}
=head2 col_type_is_numeric
@fields = $fsdb->col_type_is_numeric($column_name);
Returns non-zero if column specification is numeric.
(Actually, returns 1 for integers and 2 for floats.)
=cut
sub col_type_is_numeric($$) {
my($self, $n) = @_;
my($i) = $self->{_cols_to_i}->{$n};
return undef if (!defined($i));
my $type = substr($self->{_types}->[$i], 0, 1);
if ($type eq 'a') {
return 0;
} elsif ($type eq 'f' || $type eq 'd') {
return 2; # float family
} else {
return 1; # int family
};
}
=head2 i_to_col
@fields = $fsdb->i_to_col($column_index);
Return the name of the COLUMN_INDEX-th (0-based) column.
=cut
sub i_to_col {
my($self, $i) = @_;
return $self->{_cols}->[$i];
}
# =head2 attributes
#
# %attributes = $fsdb->attributes;
#
# Returns (a copy of) all attributes for the file (if any).
#
# =cut
#
# sub attributes() {
# my $self = shift @_;
# $self->check_attributes;
# return %{$self->{_attributes}};
# }
#
# =head2 attribute
#
# $an_attribute = $fsdb->attribute('empty');
#
# Returns one attribute of the file (if any).
#
# =cut
#
# sub attribute() {
# my $self = shift @_;
# $self->check_attributes;
# return $self->{_attributes}{$_[0]};
# }
#
# =head2 set_attribute
#
# $fsdb->set_attribute('empty', '-');
#
# Sets one attribute of the file.
#
# =cut
#
# sub set_attribute() {
# my $self = shift @_;
# $self->check_attributes;
# $self->{_attributes}{$_[0]} = $_[1];
# }
=head2 fastpath_cancel
$fsdb->fastpath_cancel();
Discard any active fastpath code and allow fastpath-incompatible operations.
=cut
sub fastpath_cancel {
my $self = shift @_;
# Just an honor code, we can't actually reach out and invalidate
# the fastpath code. :-(
$self->{_fastpath_active} = undef;
}
=head2 codify
($code, $has_last_refs) = $self->codify($underscored_pseudocode);
Convert db-code C<$UNDERSCORED_PSEUDOCODE> into perl code
in the context of a given Fsdb stream.
We return a string of code C<$CODE>
that refs C<@{$fref}> and C<@{$lfref}>
for the current and prior row arrays,
and a flag C<$HAS_LAST_REFS> if C<@{$lfref}> is needed.
It is the callers job to set these up,
probably by evaling the returned string in the context of those variables.n
The conversion is a rename of all _foo's into
database fields.
For more perverse needs, _foo(N) means the Nth field after _foo.
Also, as of 29-Jan-00, _last_foo gives the last row's value
(_last_foo(N) is not supported).
To convert we eval $codify_code.
20-Feb-07: _FROMFILE_foo opens the file called _foo and includes it in place.
NEEDSWORK: Should make some attempt to catch misspellings of column
names.
=cut
sub codify {
my $self = shift @_;
if (!defined($self->{_codifier_sub})) {
#
# Here we generate an anon sub that takes
# its args (@_) as code and returns them
# as one string of fixed code that refs @{$fref} and @{$lfref}.
#
my $codify_code = "sub {\n" .
'my $has_lfrefs = undef;' . "\n" .
'my $c = join(";", @_);' . "\n";
foreach (@{$self->cols}) {
# xxx:
# # indirect @_foo
# $codify_code .= 'if ($c =~ m/\b\_FROMFILE\(\_' . quotemeta($_) . '\)\b/) { ' .
## ' my $c = slurpfile($c[' . $colnametonum{$_} . ']); ' .
## ' my $c = "foo"; ' .
## ' s/\b\_FROMFILE\(\_' . quotemeta($_) . '\)\b/$c/g; ' .
# ' $c =~ s/\b\_FROMFILE\(\_' . quotemeta($_) . '\)\b/foo/g; ' .
# '};' . "\n";
# $codify_code .= '$c =~ s/\b\_FROMFILE\(\_' . quotemeta($_) . '\)\b/\$c\[' . $colnametonum{$_} . '\]/g;' . "\n";
# _foo(N) [perverse]
$codify_code .= "\t" . '$c =~ s/\b\_' . quotemeta($_) . '(\(.*\))/\$fref->\[' . $self->col_to_i($_) . '+$1\]/g;' . "\n";
# _foo
$codify_code .= "\t" . '$c =~ s/\b\_' . quotemeta($_) . '\b/\$fref->\[' . $self->col_to_i($_) . '\]/g;' . "\n";
$codify_code .= "\t" . '$has_lfrefs = 1 if ($c =~ /\b\_last\_' . quotemeta($_) . '\b/);' . "\n";
# _last_foo
$codify_code .= "\t" . '$c =~ s/\b\_last\_' . quotemeta($_) . '\b/\$lfref->\[' . $self->col_to_i($_) . '\]/g;' . "\n";
};
# print "CODE: $codify_code\n";
$codify_code .= "\t" . 'return ($c, $has_lfrefs);' . "\n};\n";
my $codify_sub;
eval "\$codify_sub = $codify_code;";
croak "cannot eval code:\n\t$@\n\t$codify_code\n" if ($@ ne '');
$self->{_codifier_sub} = $codify_sub;
};
#
# do it!
#
return &{$self->{_codifier_sub}}(@_);
}
=head2 clean_potential_columns
@clean = Fsdb::IO::clean_potential_columns(@dirty);
Clean up user-provided column names.
=cut
sub clean_potential_columns {
# normalize field names
grep(s/^\s+//, @_);
grep(s/\s+$//, @_);
grep(s/\s+/_/g, @_);
return @_;
}
1;