shell bypass 403

GrazzMean Shell

: /usr/share/perl5/vendor_perl/Fsdb/Filter/ [ drwxr-xr-x ]
Uname: Linux web3.us.cloudlogin.co 5.10.226-xeon-hst #2 SMP Fri Sep 13 12:28:44 UTC 2024 x86_64
Software: Apache
PHP version: 8.1.31 [ PHP INFO ] PHP os: Linux
Server Ip: 162.210.96.117
Your Ip: 18.217.209.151
User: edustar (269686) | Group: tty (888)
Safe Mode: OFF
Disable Function:
NONE

name : dbpipeline.pm
#!/usr/bin/perl -w

#
# dbpipeline.pm
# Copyright (C) 2007-2019 by John Heidemann <johnh@isi.edu>
#
# This program is distributed under terms of the GNU general
# public license, version 2.  See the file COPYING
# in $dblibdir for details.
#

package Fsdb::Filter::dbpipeline;

=head1 NAME

dbpipeline - allow db commands to be assembled as pipelines in Perl

=head1 SYNOPSIS

    use Fsdb::Filter::dbpipeline qw(:all);
    dbpipeline(
        dbrow(qw(name test1)),
        dbroweval('_test1 += 5;')
    );

Or for more customized versions, see
L</dbpipeline_filter>,
L</dbpipeline_sink>,
L</dbpipeline_open2>,
and
L</dbpipeline_close2_hash>.


=head1 DESCRIPTION

This module makes it easy to create pipelines in Perl
using separate processes.
(In the past we used to use perl threads.)

By default (as with all Fsdb modules), input is from STDIN and output
to STDOUT.  Two helper functions, fromfile and tofile can grab
data from files.

Dbpipeline differs in several ways from all other Fsdb::Filter modules:
it does not have a corresponding Unix command (it is used only from
within Perl).
It does not log its presence to the output stream (this is arguably a bug,
but it doesn't actually do anything).


=head1 OPTIONS

Unlike most Fsdb modules, dbpipeline defaults to C<--autorun>.

=for comment
begin_standard_fsdb_options

This module also supports the standard fsdb options:

=over 4

=item B<-d>

Enable debugging output.

=item B<-i> or B<--input> InputSource

Read from InputSource, typically a file name, or C<-> for standard input,
or (if in Perl) a IO::Handle, Fsdb::IO or Fsdb::BoundedQueue objects.

=item B<-o> or B<--output> OutputDestination

Write to OutputDestination, typically a file name, or C<-> for standard output,
or (if in Perl) a IO::Handle, Fsdb::IO or Fsdb::BoundedQueue objects.

=item B<--autorun> or B<--noautorun>

By default, programs process automatically,
but Fsdb::Filter objects in Perl do not run until you invoke
the run() method.
The C<--(no)autorun> option controls that behavior within Perl.

=item B<--header> H

Use H as the full Fsdb header, rather than reading a header from
the input.  This option is particularly useful when using Fsdb
under Hadoop, where split files don't have headers.

=item B<--help>

Show help.

=item B<--man>

Show full manual.

=back

=for comment
end_standard_fsdb_options


=head1 SEE ALSO

L<Fsdb(3)>


=head1 CLASS FUNCTIONS

=cut

# Inherit the standard Fsdb filter interface.
@ISA = qw(Fsdb::Filter);
($VERSION) = 2.0;

use Exporter 'import';
# Nothing is exported by default; callers request :all or specific subs.
@EXPORT = ();
@EXPORT_OK = qw(
    dbpipeline_filter
    dbpipeline_sink
    dbpipeline_open2
    dbpipeline_close2_hash
);
# update them here, in toplevel Makefile.PL, and below in the documentation.
# and add dbpipeline to this list
# BEGIN AUTOGENERATED VARIABLE SECTION
# This next section is managed by update_modules.  DO NOT EDIT DIRECTLY.
our @modularized_db_programs = qw(
	dbcol
	dbcolcopylast
	dbcolcreate
	dbcoldefine
	dbcolhisto
	dbcolmerge
	dbcolmovingstats
	dbcolneaten
	dbcolpercentile
	dbcolrename
	dbcolscorrelate
	dbcolsplittocols
	dbcolsplittorows
	dbcolsregression
	dbcolstats
	dbcolstatscores
	dbfilealter
	dbfilecat
	dbfilediff
	dbfilepivot
	dbfilestripcomments
	dbfilevalidate
	dbformmail
	dbjoin
	dbmapreduce
	dbmerge
	dbmerge2
	dbmultistats
	dbrow
	dbrowaccumulate
	dbrowcount
	dbrowdiff
	dbroweval
	dbrowuniq
	dbrvstatdiff
	dbsort
);
our @modularized_db_converters = qw(
	cgi_to_db
	combined_log_format_to_db
	csv_to_db
	db_to_csv
	db_to_html_table
	html_table_to_db
	kitrace_to_db
	mysql_to_db
	tabdelim_to_db
	tcpdump_to_db
	xml_to_db
);
our @modularized_db_non_programs = qw(
	dbpipeline
	dbsubprocess
);
# END AUTOGENERATED VARIABLE SECTION
# The :all tag exports the helper functions plus one shorthand
# constructor per modularized program (generated lazily by AUTOLOAD below).
%EXPORT_TAGS = (all => [@EXPORT_OK, @modularized_db_programs, @modularized_db_converters, @modularized_db_non_programs]);
Exporter::export_ok_tags('all');
# Set of sub names that AUTOLOAD is permitted to generate stubs for.
my %autoloadable = map { $_ => 1 } @modularized_db_programs, @modularized_db_converters, @modularized_db_non_programs;

use strict;
use Pod::Usage;
use Carp;
use IO::Pipe;

use Fsdb::Filter;
use Fsdb::IO::Reader;
use Fsdb::IO::Writer;
use Fsdb::Support::Freds;

#
# First off, create all the bindings we promise in EXPORT_TAGS.
# Automated via AUTOLOAD for extra coolness.
#
# AUTOLOAD lazily generates the promised shorthand wrapper subs
# (dbcol(), dbsort(), ...) on first use: each wrapper loads the
# corresponding Fsdb::Filter module and returns a new instance of it.
our $AUTOLOAD;
sub AUTOLOAD {
    my $sub = $AUTOLOAD;
    # Strip the package qualifier to get the bare sub name.
    (my $localsub = $sub) =~ s/.*:://;
    croak("dbpipeline: AUTOLOAD on non-autoloadable sub $sub\n")
	if (!defined($autoloadable{$localsub}));
    # String eval is intentional here: it defines a named stub sub that
    # loads the module and constructs the filter, forwarding its @_.
    eval "sub $localsub { use Fsdb::Filter::$localsub; return new Fsdb::Filter::$localsub(" . '@_' . "); };\n";
    $@ and croak("dbpipeline: error creating stubs: $@\n");
    # Tail-call the freshly defined sub with the original @_ intact.
    goto &$sub;
}

# Explicit empty DESTROY so object destruction is not routed through
# AUTOLOAD (which would croak on the unknown sub name).
sub DESTROY {
    # just suppress autoloading warnings
}

=head2 dbpipeline

    dbpipeline(@modules);

This shorthand-routine creates a dbpipeline object
and then I<immediately runs it>.

Thus perl code becomes nearly as terse as shell code:

    dbpipeline(
	dbcol(qw(name test1)),
	dbroweval('_test1 += 5;'),
    );

The following commands currently have shorthand aliases:

=for comment
BEGIN AUTOGENERATED DOCUMENTATION SECTION

=over

=item L<cgi_to_db(1)>

=item L<combined_log_format_to_db(1)>

=item L<csv_to_db(1)>

=item L<db_to_csv(1)>

=item L<db_to_html_table(1)>

=item L<dbcol(1)>

=item L<dbcolcopylast(1)>

=item L<dbcolcreate(1)>

=item L<dbcoldefine(1)>

=item L<dbcolhisto(1)>

=item L<dbcolmerge(1)>

=item L<dbcolmovingstats(1)>

=item L<dbcolneaten(1)>

=item L<dbcolpercentile(1)>

=item L<dbcolrename(1)>

=item L<dbcolscorrelate(1)>

=item L<dbcolsplittocols(1)>

=item L<dbcolsplittorows(1)>

=item L<dbcolsregression(1)>

=item L<dbcolstats(1)>

=item L<dbcolstatscores(1)>

=item L<dbfilealter(1)>

=item L<dbfilecat(1)>

=item L<dbfilediff(1)>

=item L<dbfilepivot(1)>

=item L<dbfilestripcomments(1)>

=item L<dbfilevalidate(1)>

=item L<dbformmail(1)>

=item L<dbjoin(1)>

=item L<dbmapreduce(1)>

=item L<dbmerge(1)>

=item L<dbmerge2(1)>

=item L<dbmultistats(1)>

=item L<dbrow(1)>

=item L<dbrowaccumulate(1)>

=item L<dbrowcount(1)>

=item L<dbrowdiff(1)>

=item L<dbroweval(1)>

=item L<dbrowuniq(1)>

=item L<dbrvstatdiff(1)>

=item L<dbsort(1)>

=item L<html_table_to_db(1)>

=item L<kitrace_to_db(1)>

=item L<mysql_to_db(1)>

=item L<tabdelim_to_db(1)>

=item L<tcpdump_to_db(1)>

=item L<xml_to_db(1)>

=back

=for comment
END AUTOGENERATED DOCUMENTATION SECTION

and

=over 4

=item L<dbsubprocess(3)>

=back


=cut

=head2 dbpipeline_filter

    my($result_reader, $fred) = dbpipeline_filter($source, $result_reader_aref, @modules);

Set up a pipeline of @MODULES that filters data pushed through it, 
where the data comes from $SOURCE
(any L<Fsdb::Filter::parse_io_option> object,
like a Fsdb::IO::Reader object, queue, or filename).

Returns a $RESULT_READER Fsdb::IO::Reader object,
created with $RESULT_READER_AREF as options.
This reader will produce the filtered data,
and a $FRED that must be joined to guarantee output
has completed.

Or if $RESULT_READER_AREF is C<[-raw_fh,  1]>, it just returns the IO::Handle
to the pipe.

As an example, this code uses C<dbpipeline_filter> to ensure the input
(from C<$in> which is a filename or L<Fsdb::IO::Reader>) is sorted
numerically by column C<x>:

    use Fsdb::Filter::dbpipeline qw(dbpipeline_filter dbsort);
    my($new_in, $new_fred) = dbpipeline_filter($in,
	[-comment_handler => $self->create_delay_comments_sub],
	dbsort(qw(--nolog -n x)));
    while (my $fref = $new_in->read_rowwobj()) {
	# do something
    };
    $new_in->close;
    $new_fred->join();

=cut

sub dbpipeline_filter($@) {
    # Args: data source, aref of Fsdb::IO::Reader options, then modules.
    my($pre_filter_source) = shift @_;
    my($post_filter_reader_aref) = shift @_;
    my(@args) = @_;

    my $pipe = new IO::Pipe;

    # Fork a Fred (forked child) to run the whole pipeline; it reads
    # from the caller's source and writes into the pipe.
    my $pipeline_fred = new Fsdb::Support::Freds('dbpipeline_filter',
	sub {
	    # Child side: we own the write end of the pipe.
	    $pipe->writer();
	    my $pipeline = new Fsdb::Filter::dbpipeline(
		    '--noautorun',
		    '--input' => $pre_filter_source,
		    '--output' => $pipe,
		    @args);
	    $pipeline->setup_run_finish;
	    exit 0;
	});
    if ($pipeline_fred->error()) {
	$pipe->close;
	return (undef, $pipeline_fred);
    };
    # Parent side: we own the read end.
    $pipe->reader();
    # Next line will block until pipeline produces the header!
    if ($#{$post_filter_reader_aref} >= 1
	    && $post_filter_reader_aref->[0] eq '-raw_fh'
	    && $post_filter_reader_aref->[1]) {
	# Caller asked for the raw filehandle instead of a reader object.
	return ($pipe, $pipeline_fred);
    };
    my $post_filter = new Fsdb::IO::Reader(-fh => $pipe, @$post_filter_reader_aref);
    return ($post_filter, $pipeline_fred);
}


=head2 dbpipeline_sink

    my($fsdb_writer, $fred) = dbpipeline_sink($writer_arguments_aref, @modules);

Set up a pipeline of @MODULES that is a data "sink", where the output
is given by a C<--output> argument, or goes to standard output (by default).
The caller generates input into the pipeline 
by writing to a newly created $FSDB_WRITER,
whose configuration is specified by the mandatory first
argument $WRITER_ARGUMENTS_AREF.
(These arguments should include the schema.)
Returns this writer, and a $FRED that must be joined to guarantee output
has completed.

If the first argument to modules is "--fred_exit_sub",
then the second is taken as a CODE block that runs at fred exit
(and the two are not passed to modules).

If the first argument to modules is "--fred_description",
then the second is taken as a text description of the Fred.

=cut

sub dbpipeline_sink($@) {
    # Args: aref of Fsdb::IO::Writer options, then optional
    # --fred_exit_sub / --fred_description flags, then pipeline modules.
    my($writer_aref) = shift @_;
    my(@args) = @_;

    my $fred_exit_sub = undef;
    my $fred_desc = 'dbpipeline_sink';
    # Peel off leading string options; the first ref marks the start of
    # the actual module list.
    for (;;) {
	last if ($#args == -1 || ref($args[0]) ne '');
	if ($args[0] eq '--fred_exit_sub') {
	    shift @args;
	    $fred_exit_sub = shift @args;
	} elsif ($args[0] eq '--fred_description') {
	    shift @args;
	    $fred_desc = shift @args;
	} else {
	    last;
	};
    };

    my $pipe = new IO::Pipe;

    # Fork a Fred to run the pipeline; the child consumes the read end
    # of the pipe while the caller writes rows into the other end.
    my $pipeline_fred = new Fsdb::Support::Freds($fred_desc,
	sub {
	    $pipe->reader();
	    my $pipeline = new Fsdb::Filter::dbpipeline(
		    '--noautorun',
		    '--input' => $pipe,
		    @args);
	    $pipeline->setup_run_finish;
	    exit 0;
	}, $fred_exit_sub);
    if ($pipeline_fred->error()) {
	$pipe->close;
	return (undef, $pipeline_fred);
    };
    # Parent side: hand back a writer wrapped around the write end.
    $pipe->writer();
    my $writer = new Fsdb::IO::Writer(-fh => $pipe, @$writer_aref);
    return ($writer, $pipeline_fred);
}

=head2 dbpipeline_open2

    my($fsdb_reader_fh, $fsdb_writer, $fred) = 
	dbpipeline_open2($writer_arguments_aref, @modules);

Set up a pipeline of @MODULES that is a data sink and source (both!).
The caller generates input into the pipeline 
by writing to a newly created $FSDB_WRITER,
whose configuration is specified by the mandatory 
argument $WRITER_ARGUMENTS_AREF.
(These arguments should include the schema.)
The output of the pipeline comes out to the newly
created $FSDB_READER_FH.
Returns this read queue and writer,
and a $PID that must be joined to guarantee output
has completed.

(Unfortunately the interface is asymmetric with a read I<queue>
but a write C<Fsdb::IO> object, because C<Fsdb::IO::Reader> blocks on
input of the header.)

Like L<IPC::Open2>, with all of its pros and cons like potential deadlock.

=cut

sub dbpipeline_open2 ($@) {
    # Args: aref of Fsdb::IO::Writer options, then pipeline modules.
    my($writer_aref) = shift @_;
    my(@args) = @_;

    # Two pipes: caller -> pipeline, and pipeline -> caller.
    my $into_pipeline_pipe = new IO::Pipe;
    my $from_pipeline_pipe = new IO::Pipe;
    my $pipeline_fred = new Fsdb::Support::Freds('dbpipeline_open2',
	sub {
	    # Child side: read the caller's rows, write the results.
	    $into_pipeline_pipe->reader();
	    $from_pipeline_pipe->writer();
	    my $pipeline = new Fsdb::Filter::dbpipeline(
			'--noautorun',
			'--input' => $into_pipeline_pipe,
			'--output' => $from_pipeline_pipe,
			@args);
	    $pipeline->setup_run_finish;
	    exit 0;
	});
    if ($pipeline_fred->error()) {
	$into_pipeline_pipe->close;
	$from_pipeline_pipe->close;
	return (undef, undef, $pipeline_fred);
    };
    # can't make a reader here since it will block on the header
#    my $reader = new Fsdb::IO::Reader(-queue => $from_pipeline_queue, @$reader_aref);
    $into_pipeline_pipe->writer();
    $from_pipeline_pipe->reader();
    my $writer = new Fsdb::IO::Writer(-fh => $into_pipeline_pipe, @$writer_aref);
    return ($from_pipeline_pipe, $writer, $pipeline_fred);
}

=head2 dbpipeline_close2_hash

    my($href) = dbpipeline_close2_hash($fsdb_read_fh, $fsdb_writer, $pid);

Reads and returns one line of output from $FSDB_READER,
after closing $FSDB_WRITER and joining the $PID.

Useful, for example, to get L<dbcolstats> output cleanly.

=cut

sub dbpipeline_close2_hash ($$$) {
    my($read_fh, $writer, $fred) = @_;
    # Flush and close our side of the pipeline, then wait for it
    # to finish so all output is available.
    $writer->close if (defined($writer));
    $fred->join() if (defined($fred));
    my $reader = new Fsdb::IO::Reader(-fh => $read_fh);
    croak("dbpipeline_close2_hash: couldn't setup reader.\n")
	if ($reader->error);
    # Pull exactly one data row into a hash.
    my %out_hash;
    $reader->read_row_to_href(\%out_hash)
	or croak("dbpipeline_close2_hash: no output from pipeline.\n");
    # Drain the rest of the stream: comments are fine, but any further
    # data row means the pipeline violated the one-line contract.
    while (my $obj = $reader->read_rowobj) {
	croak("dbpipeline_close2_hash: multiple lines of output.\n")
	    if (ref($obj));
    };
    return \%out_hash;
}


=head2 new

    $filter = new Fsdb::Filter::dbpipeline(@arguments);

=cut

sub new {
    # Build via the generic Fsdb::Filter constructor, then apply this
    # module's defaults and parse the caller's options.
    my($class, @args) = @_;
    my $self = $class->SUPER::new(@args);
    bless $self, $class;
    $self->set_defaults;
    $self->parse_options(@args);
    $self->SUPER::post_new();
    return $self;
}


=head2 set_defaults

    $filter->set_defaults();

Internal: set up defaults.

=cut

sub set_defaults ($) {
    my($self) = @_;
    # The ordered list of Fsdb::Filter objects forming the pipeline,
    # filled in later by parse_options.
    $self->{_modules} = [];
    $self->SUPER::set_defaults();
    $self->{_autorun} = 1;  # override superclass default
    # Optional full Fsdb header to force onto the first module's input.
    $self->{_header} = undef;
}

=head2 parse_options

    $filter->parse_options(@ARGV);

Internal: parse options

=cut

sub parse_options ($@) {
    my $self = shift @_;

    # Work on a copy; get_options consumes recognized options and
    # leaves everything else (the pipeline modules) in @argv.
    my @argv = @_;
    $self->get_options(
	\@argv,
 	'help|?' => sub { pod2usage(1); },
	'man' => sub { pod2usage(-verbose => 2); },
	'autorun!' => \$self->{_autorun},
	'close!' => \$self->{_close},
	'd|debug+' => \$self->{_debug},
	'header=s' => \$self->{_header},
	'i|input=s' => sub { $self->parse_io_option('input', @_); },
	'log!' => \$self->{_logprog},
	'o|output=s' => sub { $self->parse_io_option('output', @_); },
	) or pod2usage(2);
    # Remaining arguments are Fsdb::Filter objects to append in order.
    push(@{$self->{_modules}}, @argv);
}

=head2 _reap

    $filter->_reap();

Internal: reap any forked threads.

=cut

sub _reap($) {
    my($self) = @_;
    # Join each forked stage in pipeline order, emptying the fred list
    # as we go; failures are only reported when debugging is on.
    while (@{$self->{_freds}}) {
	my $fred = shift @{$self->{_freds}};
	if ($fred->join() == -1) {
	    print STDERR "dbpipeline: join on fred returns error.\n"
		if ($self->{_debug});
	};
	my $code = $fred->exit_code();
	if ($code != 0) {
	    print STDERR "dbpipeline: child returns exit code " . $code . "\n"
		if ($self->{_debug});
	};
    };
}


=head2 setup

    $filter->setup();

Internal: setup, parse headers.

=cut

sub setup ($) {
    my($self) = @_;

    my $prev_module_i = $#{$self->{_modules}};
    croak($self->{_prog} . ": no modules in pipeline.\n")
        if ($prev_module_i < 0);

    #
    # Make sure module inputs are sensible.
    #
    my $prev_mod = undef;
    my $i = 0;
    my $mod;
    foreach $mod (@{$self->{_modules}}) {
	croak($self->{_prog} . ": module $i isn't type Fsdb::Filter.\n")
	    if (ref($mod) !~ /^Fsdb::Filter/);
	if (defined($prev_mod)) {
	    # ($i+1) must be parenthesized: "." and "+" share precedence
	    # in Perl, so an unparenthesized $i+1 numifies the string
	    # and garbles this diagnostic.
	    croak($self->{_prog} . ": incompatible module input and output between modules $i and " . ($i+1) . ".\n")
		if ($prev_mod->info('output_type') ne $mod->info('input_type'));
	    # xxx: above is a bit too strict, since fsdbtext should match fsdb*
	};

	$prev_mod = $mod;
	$i++;
    };

    #
    # Everything benign has now happened.
    #
    # Now fork off processes for each child.
    # Ideally we would do that in run,
    # except that perl has problems when a pipe (like stdin) is
    # opened in one thread and used in another---it tries to lseek
    # and gives up when lseek fails on the pipe.
    # Sigh.  These details are all 5.8.8... maybe 5.10 fixes them?
    #
    # Setup children.
    # Build a queue to tell us when to reap them.
    #
    # Fork off each, in order, then we run the final one.
    #
    $self->{_freds} = [];
    $i = 0;
    my $final_mod_i = $#{$self->{_modules}};
    my $prev_pipe = undef;
    my $err = undef;
    foreach $mod (@{$self->{_modules}}) {
	last if ($i == $final_mod_i);   # do last module in-line
        my $pipe = new IO::Pipe;
	my $fred = new Fsdb::Support::Freds('dbpipeline',
	    sub {
		# Child: read from the previous stage (or our input)
		# and write into the fresh pipe toward the next stage.
		$prev_pipe->reader() if ($prev_pipe);
		my $in = $prev_pipe // $self->{_input};
		# Only the first stage gets the forced --header, if any.
		$mod->parse_options("--header" => $self->{_header}) if (defined($self->{_header}) && !$prev_pipe);
		$mod->parse_options("--input" => $in) if (defined($in));
		$pipe->writer() if ($pipe);
		$mod->parse_options("--output" => $pipe);
		$mod->setup_run_finish();
		exit 0;
	    });
	if ($fred->error()) {
	    $pipe->close;
	    $err = $fred->error();
	    last;
	};
	# parent
	push (@{$self->{_freds}}, $fred);
	$i++;
	$prev_pipe = $pipe;
	$prev_mod = $mod;
    };
    if ($err) {
	# A fork failed part-way; reap whatever children we started.
	$self->_reap();
	return;
    };
    # start last one in this process
    $prev_pipe->reader() if ($prev_pipe);
    my $final_mod = $self->{_modules}[$#{$self->{_modules}}];
    $final_mod->parse_options("--input" => ($prev_pipe ? $prev_pipe : $self->{_input}));
    $final_mod->parse_options("--output" => $self->{_output})
	if (defined($self->{_output}));
    $final_mod->setup();
}

=head2 run

    $filter->run();

Internal: run over all IO

=cut
sub run ($) {
    my($self) = @_;
    # Only the final stage runs in this process; the earlier stages
    # were forked during setup().  Drive it, then wait for children.
    my $last_stage = $self->{_modules}[-1];
    $last_stage->run();
    $self->_reap();
}

=head2 finish

    $filter->finish();

Internal: we would write a trailer, but we don't because
we depend on the last command in the pipeline to do that.
We don't actually have a valid output stream.

=cut
sub finish ($) {
    my($self) = @_;
    # Delegate to the last stage, which owns the real output stream;
    # dbpipeline itself has no valid output, so we deliberately do not
    # call SUPER::finish() (it would try to write a trailer).
    my $last_stage = $self->{_modules}[-1];
    $last_stage->finish();
}

=head1 AUTHOR and COPYRIGHT

Copyright (C) 1991-2018 by John Heidemann <johnh@isi.edu>

This program is distributed under terms of the GNU general
public license, version 2.  See the file COPYING
with the distribution for details.

=cut

1;

© 2025 GrazzMean