Uname: Linux web3.us.cloudlogin.co 5.10.226-xeon-hst #2 SMP Fri Sep 13 12:28:44 UTC 2024 x86_64
Software: Apache
PHP version: 8.1.31 [ PHP INFO ] PHP os: Linux
Server Ip: 162.210.96.117
Your Ip: 3.135.234.164
User: edustar (269686) | Group: tty (888)
Safe Mode: OFF
Disable Function:
NONE

name : dbcolhisto.pm
#!/usr/bin/perl -w

#
# dbcolhisto.pm
# Copyright (C) 1997-2022 by John Heidemann <johnh@isi.edu>
#
# This program is distributed under terms of the GNU general
# public license, version 2.  See the file COPYING
# in $dblibdir for details.
#

package Fsdb::Filter::dbcolhisto;

=head1 NAME

dbcolhisto - compute a histogram over a column of Fsdb data

=head1 SYNOPSIS

dbcolhisto [-ag] [-W BucketWidth] [-S BucketStart] [-E BucketEnd] [-N NumberOfBuckets] column

=head1 DESCRIPTION

This program computes a histogram over a column of data.
Records containing non-numeric data are considered null
do not contribute to the stats (optionally they are treated as zeros).

Defaults to 10 buckets over the exact range of data.
Up to three parameters (number of buckets, start, end, and width)
can be specified, the rest default accordingly.

Buckets range from a value (given the the low column) to just below
the next low value and buckets are equal width.
If necessary, extra "<min" and ">max" buckets are created.
By default, the last bucket includes max (and is thus infinitesimally 
larger than the other buckets).  This irregularity can be removed
with the C<-I> option.

This program requires O(number of buckets) memory
and O(size of data) temporary disk space.


=head1 OPTIONS

=over 4

=item B<-W> or B<--width> N

Gives with width of each bucket, in data units.
Default is whatever gives 10 buckets over the whole range of data.

=item B<-S> or B<--start> N

Buckets start at value N, in data units.
Default is the minimum data value.

=item B<-E> or B<--end> N

Buckets end at value N, in data units.
Default is the maximum data value.

=item B<-N> or B<--number> N

Create N buckets.
The default is 10 buckets.

=item B<-g> or B<--graphical>

Generate a graphical histogram (with asterisks).
Default is numeric.

=item B<-I> or B<--last-inclusive>

Make the last bucket non-inclusive of the last value.

=item B<-a>

Compute stats over all records (treat non-numeric records
as zero rather than just ignoring them).
Default is non-numeric records are ignored.

=item B<-e> EmptyValue or B<--empty>

Specify the value any null bins get.  (Default: -.)

=back

=for comment
begin_standard_fsdb_options

This module also supports the standard fsdb options:

=over 4

=item B<-d>

Enable debugging output.

=item B<-i> or B<--input> InputSource

Read from InputSource, typically a file name, or C<-> for standard input,
or (if in Perl) a IO::Handle, Fsdb::IO or Fsdb::BoundedQueue objects.

=item B<-o> or B<--output> OutputDestination

Write to OutputDestination, typically a file name, or C<-> for standard output,
or (if in Perl) a IO::Handle, Fsdb::IO or Fsdb::BoundedQueue objects.

=item B<--autorun> or B<--noautorun>

By default, programs process automatically,
but Fsdb::Filter objects in Perl do not run until you invoke
the run() method.
The C<--(no)autorun> option controls that behavior within Perl.

=item B<--help>

Show help.

=item B<--man>

Show full manual.

=back

=for comment
end_standard_fsdb_options


=head1 SAMPLE USAGE

=head2 Input:

    #fsdb name id test1
    a 1 80
    b 2 70
    c 3 65
    d 4 90
    e 5 70
    f 6 90

=head2 Command:

    cat DATA/grades.fsdb | dbcolhisto -S 0 -E 100 -N 10 test1

=head2 Output:

    #fsdb low histogram:q
    0       0
    10      0
    20      0
    30      0
    40      0
    50      0
    60      1
    70      2
    80      1
    90      2
    #  | dbcolhisto -S 0 -E 100 -N 10 test1


=head1 SEE ALSO

L<Fsdb>,
L<dbcolpercentile>,
L<dbcolstats>

=head1 BUGS

This program could run in constant memory with no external storage
when the buckets are pre-specified.  That optimization is not implemented.


=head1 CLASS FUNCTIONS

=cut

@ISA = qw(Fsdb::Filter);
($VERSION) = 2.0;

use strict;
use Pod::Usage;
use Carp;

use Fsdb::Filter;
use Fsdb::IO::Reader;
use Fsdb::IO::Writer;
use Fsdb::IO::Replayable;
use Fsdb::Support qw($is_numeric_regexp);


=head2 new

    $filter = new Fsdb::Filter::dbcolhisto(@arguments);

Create a new dbcolhisto object, taking command-line arguments.

=cut

sub new ($@) {
    my $class = shift @_;
    my $self = $class->SUPER::new(@_);
    bless $self, $class;
    $self->set_defaults;
    $self->parse_options(@_);
    $self->SUPER::post_new();
    return $self;
}


=head2 set_defaults

    $filter->set_defaults();

Internal: set up defaults.

=cut

sub set_defaults ($) {
    my($self) = @_;
    $self->SUPER::set_defaults();
    $self->{_bucket_width} = undef;
    $self->{_bucket_start} = undef;
    $self->{_bucket_end} = undef;
    $self->{_bucket_count} = undef;
    $self->{_graphical_output} = undef;
    $self->{_last_inclusive} = 1;
    $self->{_include_non_numeric} = undef;
    $self->{_fscode} = undef;
}

=head2 parse_options

    $filter->parse_options(@ARGV);

Internal: parse command-line arguments.

=cut

sub parse_options ($@) {
    my $self = shift @_;

    my(@argv) = @_;
    $self->get_options(
	\@argv,
 	'help|?' => sub { pod2usage(1); },
	'man' => sub { pod2usage(-verbose => 2); },
	'autorun!' => \$self->{_autorun},
	'close!' => \$self->{_close},
	'a|include-non-numeric!' => \$self->{_include_non_numeric},
	'd|debug+' => \$self->{_debug},
	'e|empty=s' => \$self->{_empty},
	'E|end=f' => \$self->{_bucket_end},
	'F|fs|cs|fieldseparator|columnseparator=s' => \$self->{_fscode},
	'g|graphical!' => \$self->{_graphical_output},
	'i|input=s' => sub { $self->parse_io_option('input', @_); },
	'I|last-inclusive!' => \$self->{_last_inclusive},
	'log!' => \$self->{_logprog},
	'N|number=i' => \$self->{_bucket_count},
	'o|output=s' => sub { $self->parse_io_option('output', @_); },
	'S|start=f' => \$self->{_bucket_start},
	'W|width=f' => \$self->{_bucket_width},
	) or pod2usage(2);
    $self->parse_target_column(\@argv);
}

=head2 setup

    $filter->setup();

Internal: setup, parse headers.

=cut

sub setup ($) {
    my($self) = @_;

    pod2usage(2) if (!defined($self->{_target_column}));

    $self->finish_io_option('input', -comment_handler => $self->create_delay_comments_sub);
    $self->{_target_coli} = $self->{_in}->col_to_i($self->{_target_column});
    # propagate an integral type, if any
    my($target_type) = $self->{_in}->col_to_type($self->{_target_column}) // 'd';

    my(@cols) = ('low:' . $target_type, 'histogram' . ($self->{_graphical_output} ? '' : ':q'));
    my @output_options = (-cols => \@cols);
    unshift (@output_options, -fscode => $self->{_fscode})
	if (defined($self->{_fscode}));
    $self->finish_io_option('output', @output_options);
}

=head2 run

    $filter->run();

Internal: run over each rows.

=cut
sub run ($) {
    my($self) = @_;
    my $read_fastpath_sub = $self->{_in}->fastpath_sub();
    my $write_fastpath_sub = $self->{_out}->fastpath_sub();

    #
    # scan the data to find min/max/n
    # (We could just invoke dbcolstats, but that's overkill.)
    #
    my($min, $max);
    my($n) = 0;

    my $replayable = new Fsdb::IO::Replayable(-writer_args => [ -cols => [qw(x)] ]);
    my $replayable_writer = $replayable->writer;
    my $replayable_writer_fastpath_sub = $replayable_writer->fastpath_sub();
    my $fref;
    my($xf) = $self->{_target_coli};
    my @of;
    my $x;
    while ($fref = &$read_fastpath_sub()) {
	$x = $fref->[$xf];
	if (!$self->{_include_non_numeric}) {
	    next if ($x !~ /$is_numeric_regexp/);
	};
	$min = $x if (!defined($min) || $x < $min);
	$max = $x if (!defined($max) || $x > $max);
	$n++;
	$of[0] = $x;
	&$replayable_writer_fastpath_sub(\@of);
    };
    $replayable->close;

    #
    # sanity check
    #
    if ($n == 0) {
	croak($self->{_prog} . ": histogram impossible with no input\n");
    } elsif ($n == 1) {
	croak($self->{_prog} . ": histogram impossible with singleton input\n");
    };

    #
    # Figure out bucket parameters.
    # Yuck.  Constraint solving in Perl.
    #
    my $bucket_start = $self->{_bucket_start};
    my $bucket_end = $self->{_bucket_end};
    my $bucket_width = $self->{_bucket_width};
    my $bucket_count = $self->{_bucket_count};
    my($n_defined) = 
        (defined($bucket_start) ? 1 : 0) +
        (defined($bucket_end) ? 1 : 0) +
        (defined($bucket_width) ? 1 : 0) +
        (defined($bucket_count) ? 1 : 0);
    if ($n_defined >= 4) {
        croak($self->{_prog} . ": parameters over-specified.\n");
    } elsif ($n_defined == 3) {
        # fall through, clean up handles it.
    } elsif ($n_defined == 2) {
        if (defined($bucket_start) && defined($bucket_end)) {
    	    $bucket_count = 10;
        } elsif (defined($bucket_start) && defined($bucket_width)) {
    	    $bucket_count = 10;
        } elsif (defined($bucket_start) && defined($bucket_count)) {
    	    $bucket_end = $max;
        } elsif (defined($bucket_end) && defined($bucket_width)) {
    	    $bucket_count = 10;
        } elsif (defined($bucket_end) && defined($bucket_count)) {
    	    $bucket_start = $min;
        } elsif (defined($bucket_width) && defined($bucket_count)) {
    	    my($mid) = ($max - $min) / 2 + $min;
    	    $bucket_start = $mid - $bucket_width * $bucket_count / 2;
        } else {
    	    croak("internal error\n");
        };
        # Figure the rest out below.
    } elsif ($n_defined == 1) {
        if (defined($bucket_start)) {
    	    $bucket_end = $max;
    	    $bucket_count = 10;
        } elsif (defined($bucket_end)) {
    	    $bucket_start = $min;
    	    $bucket_count = 10;
        } elsif (defined($bucket_width) || defined($bucket_count)) {
    	    $bucket_start = $min;
    	    $bucket_end = $max;
        } else {
    	    croak("internal error\n");
        };
    } elsif ($n_defined < 1) {
        $bucket_start = $min;
        $bucket_end = $max;
        $bucket_count = 10;
    };
    # clean up
    croak($self->{_prog} . ": no buckets\n") if ($bucket_count <= 0);
    $bucket_start = $bucket_end - $bucket_width * $bucket_count
        if (!defined($bucket_start));
    $bucket_end = $bucket_start + $bucket_width * $bucket_count
        if (!defined($bucket_end));
    $bucket_width = ($bucket_end - $bucket_start) / $bucket_count
        if (!defined($bucket_width));
    $bucket_width = $bucket_end / $bucket_count if ($bucket_width == 0);
    $bucket_count = ($bucket_end - $bucket_start) / $bucket_width
        if (!defined($bucket_count));
    $bucket_width += 0.0;

    #    
    # Compute the histogram.
    #
    my(@buckets) = (0) x $bucket_count;
    my($low_bucket, $high_bucket) = (0, 0);

    my $replayable_reader = $replayable->reader;
    my $replayable_reader_fastpath_sub = $replayable_reader->fastpath_sub();
    while ($fref = &$replayable_reader_fastpath_sub()) {
	my $x = $fref->[0] + 0;
        my($b) = ($x - $bucket_start) / ($bucket_width);
        if ($b < 0) {
            $low_bucket++;
        } elsif ($b >= $bucket_count) {
            if (($x == $high_bucket || $b == $bucket_count) && $self->{_last_inclusive}) {
                $buckets[$bucket_count]++;
            } else {
                $high_bucket++;
            };
        } else {
            $buckets[int($b)]++;
        };
    }
    $replayable_reader->close;

    my $empty_value = $self->{_empty};
    my $format_sub = $self->{_graphical_output} ?
	sub { return ($_[0] == 0) ? $empty_value : ("*" x $_[0]); } :
	sub { return $_[0]; };

    if ($low_bucket) {
	@of = ("<" . $bucket_start, &$format_sub($low_bucket));
	&$write_fastpath_sub(\@of);
    };
    foreach (0..$#buckets) {
	@of = ($_ * $bucket_width + $bucket_start, &$format_sub($buckets[$_]));
	&$write_fastpath_sub(\@of);
    };
    if ($high_bucket) {
	my $last = $#buckets * $bucket_width + $bucket_start;
	@of = (">=" . $last, &$format_sub($high_bucket));
	&$write_fastpath_sub(\@of);
    };
}


=head1 AUTHOR and COPYRIGHT

Copyright (C) 1991-2022 by John Heidemann <johnh@isi.edu>

This program is distributed under terms of the GNU general
public license, version 2.  See the file COPYING
with the distribution for details.

=cut

1;
© 2025 GrazzMean