#!/usr/bin/perl -w
#$Id: map,v 1.20 1998/02/11 23:58:27 schwartz Exp $
#
# map - convert a text file to a different character set
#
# See also usage() of this file. General information at:
# http://wwwwbs.cs.tu-berlin.de/~schwartz/pmh/index.html
#
# Copyright (C) 1998 Martin Schwartz. All rights reserved.
# This program is free software; you can redistribute it and/or
# modify it under the same terms as Perl itself.
#
# Contact: Martin Schwartz <martin@nacho.de>
#
# Warnings are already enabled through -w on the shebang line; strict
# catches undeclared variables and symbolic references at compile time.
use strict;

use Getopt::Long;
use Unicode::Map;

# Program identity, shown in the banner printed by usage().
my $PROGNAME = "map";
my $VERSION  = "1.21";
my $DATE     = "2000-Jun-26";

# Command line options, filled in by GetOptions().
my %opt = ();

# Character set used for whichever of --from / --to was not given.
my $defaultCsId = "ISO-8859-1";
# Entry point: parse the command line, then either list the known character
# sets (--list) or convert STDIN to STDOUT (--from / --to).
#
# handle_stream() and list_csids() return a true value on success and a
# false value on failure. That result is translated into a conventional
# process exit status here: 0 for success, 1 for failure.
main: {
    $| = 1;      # autoflush the selected output handle
    undef $/;    # slurp mode: a read returns the whole remaining input

    # GetOptions() reports the details itself via warn() when it returns
    # false; do not go on with a command line that was not understood.
    unless ( GetOptions( \%opt, "from=s", "help", "list", "to=s" ) ) {
        print STDERR "$PROGNAME: error in command line arguments (try --help)\n";
        exit 2;
    }
    usage() if $opt{"help"};

    my $ok;
    if ( $opt{"list"} ) {
        $ok = list_csids();
    }
    else {
        # At least one of --from / --to is required; usage() does not return.
        usage() if !$opt{"to"} && !$opt{"from"};

        $opt{"from"} ||= $defaultCsId;
        $opt{"to"}   ||= $defaultCsId;
        $ok = handle_stream();
    }

    exit( $ok ? 0 : 1 );
}
# Converts everything on STDIN from the character set $opt{"from"} to the
# character set $opt{"to"} and writes the result to STDOUT.
#
# The pseudo character set "unicode" (any letter case) means that the data
# is passed through unchanged on that side, without consulting Unicode::Map.
#
# Returns 1 on success, 0 if a requested mapping is not available.
# Diagnostics go to STDERR so that they never end up inside the converted
# output.
sub handle_stream {
    # Treat both streams as binary so that no newline translation or I/O
    # layer alters the character data.
    binmode STDIN;
    binmode STDOUT;

    # Slurp the whole input; $/ is only changed for the duration of the read.
    my $input = do { local $/; <STDIN> };
    $input = "" unless defined $input;

    my $from = $opt{"from"};
    my $unicode;
    if ( $from =~ /^unicode$/i ) {
        $unicode = $input;
    }
    else {
        my $map_from = _load_map($from) or return 0;
        $unicode = $map_from->to_unicode($input);
    }
    undef $input;    # not needed any more

    my $to = $opt{"to"};
    my $output;
    if ( $to =~ /^unicode$/i ) {
        $output = $unicode;
    }
    else {
        my $map_to = _load_map($to) or return 0;
        $output = $map_to->from_unicode($unicode);
    }
    undef $unicode;    # not needed any more

    print STDOUT $output;
    return 1;
}

# Creates the Unicode::Map object for character set $csid. Returns the
# object, or a false value after printing an error message to STDERR.
sub _load_map {
    my ($csid) = @_;

    my $map = Unicode::Map->new($csid);
    print STDERR "Error! Mapping \"$csid\" not available!\n" unless $map;
    return $map;
}
# Prints a numbered list of the character set ids reported by Unicode::Map,
# each followed by its alias names (sorted, in parentheses) if it has any.
#
# Returns 1 on success, 0 if no Unicode::Map object could be created.
sub list_csids {
    my $map = Unicode::Map->new() or return 0;

    print "Defined character sets:\n";

    my $number = 0;
    for my $csid ( $map->ids() ) {
        # The id is passed as an argument instead of being interpolated into
        # the format, so a "%" in a name cannot be taken for a conversion.
        my $line = sprintf "%02d: %s", ++$number, $csid;

        my @aliases = sort { $a cmp $b } $map->alias($csid);
        $line .= " (" . join( ", ", @aliases ) . ")" if @aliases;

        print "$line\n";
    }

    print "Done.\n";
    return 1;
}
# Prints the program banner, the invocation synopsis and the option summary,
# then exits with status 0. Never returns to the caller.
#
# The synopsis reflects what the program actually does: it reads STDIN and
# writes STDOUT and takes no file name arguments.
sub usage {
    _print_usage(
        "$PROGNAME V$VERSION ($DATE) - recode from and to Unicode\n"
            . "usage: $PROGNAME [--from cset] [--to cset] < input > output\n"
            . "       $PROGNAME --list",
        [
            "from s Encoding of input files (default \"$defaultCsId\")",
            "list Lists available character sets and their alias names.",
            "to s Encoding of output files (default \"$defaultCsId\")",
        ],
        "Reads from STDIN, writes to STDOUT."
            . " At least one of --from and --to is required."
    );
    exit 0;
}
# Writes a usage text: an optional header line, one " --<entry>" line per
# element of the array referenced by $bodylistR (sorted case-insensitively),
# and an optional footer line. A false header or footer is left out.
sub _print_usage {
    my ( $header, $bodylistR, $footer ) = @_;

    my $text = "";
    $text .= "$header\n" if $header;

    for my $entry ( sort { lc($a) cmp lc($b) } @$bodylistR ) {
        $text .= " --$entry\n";
    }

    $text .= "$footer\n" if $footer;

    print $text;
}
__END__
=head1 NAME
map - A utility to map text from and to Unicode
=head1 SYNOPSIS
map - recode from and to various character sets.
Reads from STDIN, writes to STDOUT.
usage: map [--from cset] [--to cset] < input.txt > output.txt
from s Encoding of input files (default "ISO-8859-1")
list Lists available character sets and their alias names.
to s Encoding of output files (default "ISO-8859-1")
=head1 DESCRIPTION
Maps text from one character set representation to another. This job has
long been done very well by C<recode>, but unfortunately recode does not
support Unicode or East Asian character sets. For pure 8-bit conversions,
however, recode is still the best solution.
Examples:
Conversion from ISO-8859-1 to Unicode:
map --to unicode < iso-8859-1.txt > unicode.txt
Conversion from GB2312 to CP936:
map --from GB2312 --to cp936 < gb2312.txt > cp936.txt
Conversion from CP850 to Unicode:
map --from cp850 --to unicode < cp850.txt > unicode.txt
=head1 SEE ALSO
recode(1), Unicode::Map(3), Unicode::Map8(3), Unicode::String(3)
=head1 AUTHOR
Martin Schwartz E<lt>F<martin@nacho.de>E<gt>.
=cut