dwww Home | Show directory contents | Find package

#!/usr/bin/perl

# csv2xlsx: Convert csv to xlsx
#          (m)'22 Copyright H.M.Brand 2007-2023

use 5.014000;
use warnings;

our $VERSION = "1.20 - 2022-06-21";

sub usage {
    my $err = shift and select STDERR;
    print <<"EOU";
usage: csv2xlsx [options] [-o <xlsx>] [file.csv]
       csv2xlsx --help | --man | --info
       -s <sep>   use <sep>   as separator char, auto-detect, default = ','
                  The string "tab" is allowed.
       -e <esc>   use <esc>   as escape    char, auto-detect, default = '"'
                  The string "undef" is allowed.
       -q <quot>  use <quot>  as quotation char,              default = '"'
                  The string "undef" will disable quotation.
       -w <width> use <width> as default minimum column width default = 4
       -o <xlsx>  write output to file named <xlsx>, defaults
                  to input file name with .csv replaced with .xlsx
                  if from standard input, defaults to csv2xlsx.xlsx
       -F         allow formula's. Otherwise fields starting with
                  an equal sign are forced to string
         --Fa=aaa Define formula action: none/die/croak/diag/empty/undef
         --Ft     Formula's will be stored as text (formula actions: none)
         --Fd     Formula's will cause a die
         --Fc     Formula's will cause a croak
         --FD     Formula's will cause a warning (this is the default)
         --Fe     Formula's will be replaced by the empty string
         --Fu     Formula's will be replaced with an undefined cell
       -f         force usage of <xlsx> if already exists (unlink before use)
       -d <dtfmt> use <dtfmt> as date formats.   Default = 'dd-mm-yyyy'
       -C <C:fmt> use <fmt> as currency formats for currency <C>, no default
       -D cols    only convert dates in columns <cols>.
                  Default is everywhere. -D0 is disable
       -L N       limit export to N rows
       -u         CSV is UTF8
         --de     Some CSV fields might be double-encoded. Try to fix that.
       -m         merge multiple CSV's into a single xlsx (separate sheets)
                    -o is required, all arguments should be existing files
       -S <cp>    Split CSV on COLUMNxPAT into separate sheets. See --man
                  or --info for options/features and examples. May repeat.
         --sl=C   Use column C as sheet label when splitting with -S
       -v [<lvl>] verbosity (default = 1)
EOU
    exit $err;
    } # usage

use Text::CSV_XS;
use Excel::Writer::XLSX;
use List::Util   qw( first                       );
use Date::Calc   qw( Delta_Days Days_in_Month    );
use Encode       qw( from_to                     );
use Getopt::Long qw(:config bundling passthrough );
my $quo = '"';
my $esc = '"';
my $wdt = 4;            # Default minimal column width
my $fac = "diag";       # Formula action (default is warn only)
my $dtf = "dd-mm-yyyy"; # Date format
my $crf = "";           # Currency format, e.g.: $:### ### ##0.00
my $opt_v = 1;
my $dtc;

GetOptions (
    "V|version"         => sub { say $0 =~ s{.*/}{}r, " [$VERSION]"; exit 0; },
      "help|?"          => sub { usage (0); },
      "man"             => sub { pod_nroff (); },
      "info"            => sub { pod_text  (); },

    "c|s|sep=s"         => \my $sep, # Set after reading first line in attempt to auto-detect
    "q|quo=s"           => \$quo,
    "e|esc=s"           => \$esc,
    "w|width=i"         => \$wdt,
    "o|x|out=s"         => \my $xls,
    "d|date-fmt=s"      => \$dtf,
    "D|date-col=s"      => \$dtc,
    "C|curr-fmt=s"      => \$crf,
    "f|force!"          => \my $frc,
    "F|formulas!"       => \my $frm,
      "Fa=s"            => \$fac,
      "Ft"              => sub { $fac = "none";         },
      "Fd"              => sub { $fac = "die";          },
      "Fc"              => sub { $fac = "croak";        },
      "FD"              => sub { $fac = "diag";         },
      "Fe"              => sub { $fac = "empty";        },
      "Fu"              => sub { $fac = "undef";        },
    "u|utf-8|utf8!"     => \my $utf,
      "de|fix-utf8!"    => \my $dutf, # double encoded? \x{c3}\x{ab} => \x{100}
    "m|merge!"          => \my $mrg,
    "S|split=s"         => \my @split,
      "sl|split-label=s"=> \my $split_sl,
    "L|row-limit=i"     => \my $row_limit,
    "v|verbose:2"       => \$opt_v,
    ) or usage (1);

sub pod_text {
    require Pod::Text::Color;
    my $m = $ENV{NO_COLOR} ? "Pod::Text" : "Pod::Text::Color";
    my $p = $m->new ();
    open my $fh, ">", \my $out or die "Cannot generate manual: $!\n";
    $p->parse_from_file ($0, $fh);
    close $fh;
    print $out;
    exit 0;
    } # pod_text

sub pod_nroff {
    first { -x "$_/nroff" } grep { -d } split m/:+/ => $ENV{PATH} or pod_text ();

    require Pod::Man;
    my $p = Pod::Man->new ();
    open my $fh, "|-", "nroff", "-man" or die "Cannot generate manual: $!\n";
    $p->parse_from_file ($0, $fh);
    close $fh;
    exit 0;
    } # pod_nroff

if ($mrg) {
    my @csv;
    for (@ARGV) {
        if (m/\.xlsx?$/i) {
            $xls and usage (1);
            $xls = $_;
            next;
            }
        if (m/\.(?:csv|png|jpe?g|bmp|gif|tiff|xpm)$/i && -s) {
            push @csv => $_;
            next;
            }
        warn "Argument $_ is not an existing (CSV) file\n";
        usage (1);
        }
    $xls && @csv or usage (1);
    @ARGV = @csv;
    }

sub col2col {
    my $l = shift;
    $l =~ m/^[0-9]/ and return $l;
    my $c = 0;
    while ($l =~ s/^([A-Za-z])//) {
        $c = 26 * $c + 1 + ord (uc $1) - ord ("A");
        }
    $c;
    } # col2col

foreach my $split (@split) {
    my ($col, $operator, $pat) = ($split =~ m{^
        ([0-9]+|[A-Z]+|[a-z]+)  # Column: A, AB, 1, 14
        ([=/uU<>])              # Operator
        (.*)                    # Pattern/string
        \z}x) or usage (1);
    my $case = $col =~ m/^[a-z]/ ? 1 : 0;
    $split = {
        col => col2col ($col),
        op  => $operator,
        str => $pat,
        ic  => $case,
        lbl => undef,
        };
    if ($split_sl) {
        ($col) = ($split_sl =~ m/^([0-9]+|[A-Z]+)$/) or usage (1);
        $split->{lbl} = col2col ($col);
        }
    }

my $base = @ARGV && -f $ARGV[0] ? $ARGV[0] : "csv2xlsx";
$xls ||= $base =~ s/(?:\.csv)?$/.xlsx/ir;

-s $xls && $frc and unlink $xls;
if (-s $xls) {
    print STDERR "File '$xls' already exists. Overwrite? [y/N] > N\b";
    scalar <STDIN> =~ m/^[yj](?:es|a)?$/i or exit;
    }
# Test if the file can be created
# The error from Excel::Writer::XLSX is not very informative if it fails
# e.g. when the folder cannot be used for whatever reason
{   open my $fh, ">", $xls or die "$xls: $!\n";
    close $fh;
    unlink $xls;
    }

if ($dutf) {
    eval { require Encode::DoubleEncodedUTF8; };
    if ($@) {
        $dutf = 0;
        warn "Cannot load Encode::DoubleEncodedUTF8; --de ignored\n";
        }
    }

my $wbk = Excel::Writer::XLSX->new ($xls);
   $dtf =~ s/j/y/g;
my %fmt = (
    date => $wbk->add_format (align => "center", num_format => $dtf),
    rest => $wbk->add_format (align => "left"),
    wrap => $wbk->add_format (text_wrap => 1),
    );
$crf =~ s/^([^:]+):(.*)/$1/ and $fmt{currency} = $wbk->add_format (
    num_format  => "$1 $2",
    align       => "right",
    );

my @args = @ARGV ? @ARGV : ("");
foreach my $csvf (@args) {
    my $sheetname = $csvf =~ s{\.\w+$}{}ir =~ s{.*/}{}r || "Sheet 1";
    ($_ = length $sheetname) > 31 and substr $sheetname, 31, $_ - 31, "";
    my ($wks, $w, $h);

    $opt_v > 7 and warn "Parsing $csvf into $xls.$sheetname ...\n";

    if ($csvf =~ m/\.(png|jpe?g|bmp|gif|tiff|xpm)$/i) {
        $wks = $wbk->add_worksheet ($sheetname);
        $wks->insert_image (1, 1, $csvf);
        next;
        }

    my $row;
    my $firstline;
    my $fh;
    if (-f $csvf) {
        $opt_v and say "Reading $csvf";
        open $fh, "<", $csvf or die "$csvf: $!\n";
        }
    else {
        $opt_v and say "Reading STDIN";
        $fh = *ARGV;
        }
    my $Sep = $sep;
    unless ($Sep) { # No sep char passed, try to auto-detect;
        while (<$fh>) {
            m/\S/ or next;      # Skip empty leading blank lines
            $Sep = # start auto-detect with quoted strings
                   m/["\d];["\d;]/    ? ";"  :
                   m/["\d],["\d,]/    ? ","  :
                   m/["\d]\t["\d,]/   ? "\t" :
                   # If neither, then for unquoted strings
                   m/\w;[\w;]/        ? ";"  :
                   m/\w,[\w,]/        ? ","  :
                   m/\w\t[\w,]/       ? "\t" :
                   # And pipes (lowest prio)
                   m/["\d]\|["\d,]/   ? "|"  :
                   m/\w\|[\w,]/       ? "|"  :
                   m/,/  && !m/[;\t]/ ? ","  :
                   m/;/  && !m/[,\t]/ ? ";"  :
                   m/\t/ && !m/[,;]/  ? "\t" :
                                        ","  ;
            $firstline = $_;
            last;
            }
        $firstline or die "The sourcefile does not contain any usable data\n";
        }
    my $csv = Text::CSV_XS-> new ({
        sep_char       => $Sep eq "tab"   ? "\t"  : $Sep,
        quote_char     => $quo eq "undef" ? undef : $quo,
        escape_char    => $esc eq "undef" ? undef : $esc,
        binary         => 1,
        keep_meta_info => 1,
        auto_diag      => 1,
        formula        => $fac,
        });
    if ($firstline) {
        $csv->parse ($firstline) or die $csv->error_diag ();
        $row = [ $csv->fields ];
        }
    if ($opt_v > 3) {
        foreach my $k (qw( sep_char quote_char escape_char )) {
            my $c = $csv->$k () || "undef";
            $c =~ s/\t/\\t/g;
            $c =~ s/\r/\\r/g;
            $c =~ s/\n/\\n/g;
            $c =~ s/\0/\\0/g;
            $c =~ s/([\x00-\x1f\x7f-\xff])/sprintf"\\x{%02x}",ord$1/ge;
            printf STDERR "%-11s = %s\n", $k, $c;
            }
        }

    if (length $dtc) {
        if ($dtc eq "0") {
            $dtc = { -2 => 0 };
            }
        else {
            my $rows = $dtc;
            $rows =~ s/-$/-999/;                        # 3,6-
            $rows =~ s/-/../g;
            eval "\$dtc = { map { \$_ => 1 } $rows }";
            }
        }

    my @w; # data height, -width, and default column widths
    while ($row && @$row or $row = $csv->getline ($fh)) {
        $row_limit and $csv->record_number > $row_limit and last;
        my @row = @$row;

        $opt_v > 8 and warn "@row\n";
        if (@split) {
            my $ns = 0;
            foreach my $split (@split) {
                my ($i, $op, $str, $case) = @{$split}{qw( col op str ic )};
                my $v = $row[$i - 1]; # Column-index is one-based
                if (defined $v) {
                    $op eq "U"          and $ns += 1;
                    if ($case) {
                        $op eq "="      and $ns += ( lc $v eq lc $v     );
                        $op eq "/"      and $ns += ( $v =~ m/$str/i     );
                        }
                    else {
                        $op eq "="      and $ns += ( $v eq   $str       );
                        $op eq "/"      and $ns += ( $v =~ m/$str/      );
                        if ($v =~ m/^[0-9]+$/) {
                            $op eq "<"  and $ns += ( $v <  $str );
                            $op eq ">"  and $ns += ( $v >  $str );
                            }
                        else {
                            $op eq "<"  and $ns += ( $v lt $str );
                            $op eq ">"  and $ns += ( $v gt $str );
                            }
                        }
                    }
                else {
                    $op eq "u"          and $ns += 1;
                    }
                }

            $opt_v > 8 and warn join " " => "Record", $csv->record_number,
                "matched", $ns, "out of", scalar @split, "criteria\n";
            if ($ns == @split) {        # All criteria PASS
                if (@w) {
                    $wks->set_column ($_, $_, $w[$_]) for 0 .. $#w;
                    $wks = undef;
                    }
                $split[0]{lbl} and $sheetname = $row[$split[0]{lbl} - 1];
                }
            }

        unless ($wks) {
            $wks = $wbk->add_worksheet ($sheetname);
            $utf && !$wks->can ("write_unicode") and $utf = 0;
            ($h, $w, @w) = (0, 1);
            }

        @row > $w and push @w => ($wdt) x (($w = @row) - @w);
        foreach my $c (0 .. $#row) {
            my $val = $row[$c] // "";
            my $l   = length $val;
            $l > ($w[$c] // -1) and $w[$c] = $l;

            $dutf and $csv->is_binary ($c) and utf8::valid ($val) and
                $val = Encode::decode ("utf-8-de", $val);

            if ($utf and $csv->is_binary ($c)) {
                from_to ($val, "utf-8", "ucs2");
                $wks->write_unicode ($h, $c, $val);
                next;
                }

            if ($csv->is_quoted ($c)) {
                $val =~ s/\r\n/\n/g;
                if ($utf) {
                    from_to ($val, "utf-8", "ucs2");
                    $val =~ m/\n/
                        ? $wks->write_unicode ($h, $c, $val, $fmt{wrap})
                        : $wks->write_unicode ($h, $c, $val);
                    }
                else {
                    $val =~ m/\n/
                        ? $wks->write_string  ($h, $c, $val, $fmt{wrap})
                        : $wks->write_string  ($h, $c, $val);
                    }
                next;
                }

            if (!$dtc or $dtc->{$c + 1}) {
                my @d = (0, 0, 0);      # Y, M, D
                $val =~ m/^(\d{4})(\d{2})(\d{2})$/   and @d = ($1, $2, $3);
                $val =~ m/^(\d{2})-(\d{2})-(\d{4})$/ and @d = ($3, $2, $1);
                if ( $d[2] >=    1 && $d[2] <=   31 &&
                     $d[1] >=    1 && $d[1] <=   12 &&
                     $d[0] >= 1900 && $d[0] <= 2199) {
                    my $dm = Days_in_Month (@d[0,1]);
                    $d[2] <   1 and $d[2] = 1;
                    $d[2] > $dm and $d[2] = $dm;
                    my $dt = 2 + Delta_Days (1900, 1, 1, @d);
                    $wks->write ($h, $c, $dt, $fmt{date});
                    next;
                    }
                }
            if ($crf and $val =~ m/^\s*\Q$crf\E\s*([0-9.]+)$/) {
                $wks->write ($h, $c, $1 + 0, $fmt{currency});
                next;
                }

            if (!$frm && $val =~ m/^=/) {
                $wks->write_string  ($h, $c, $val);
                }
            else {
                $wks->write ($h, $c, $val);
                }
            }
        ++$h % 100 or $opt_v && printf STDERR "%6d x %6d\r", $w, $h;
        } continue { $row = undef }
    close $fh;
    $opt_v && printf STDERR "%6d x %6d\n", $w, $h;

    $wks->set_column ($_, $_, $w[$_]) for 0 .. $#w;
    }
$opt_v and say "Writing $xls";
$wbk->close ();

__END__

=head1 NAME

csv2xlsx - Convert CSV to Excel 2007+

=head1 SYNOPSIS

 csv2xlsx [options] [file.csv]

 csv2xlsx test.csv

 csv2xlsx -f -o merged.xlsx -m foobar*.csv

=head1 DESCRIPTION

This tool converts CSV data to Excel-2007+. It can convert the CSV into a
single sheet, merge CSV files into multiple sheets in a workbook or split
a single CSV into multiple sheets.

The tool supports encoding, formula handle, date conversion and some more.

=head1 OPTIONS

=over 2

=item -s S

=item --sep=S

use C<S> as separator character, auto-detect, default = C<,>.

The literal string C<tab> is allowed.

=item -q Q

=item --quo=Q

use C<Q> as quotation character, auto-detect, default = C<">.

The literal string C<undef> is allowed to disable quotation.

=item -e E

=item --esc=E

use C<E> as escape character, auto-detect, default = C<">.

The literal string C<undef> is allowed to disable escapes.

=item -w W

=item --width=W

Set column width. Default is to auto-size per column per sheet with a
minimum width of C<4>.

=item -o FILE.xlsx

=item -x FILE.xlsx

=item --out=FILE.xlsx

Specify the output filename. Default it the same name as the input-file
where the trailing C<.csv> will be replaced with C<.xlsx>.

If a filename can not be automatically generated, it will default to
C<csv2xlsx.xlsx>.

=item -d format

=item --date-fmt=format

Use C<format> as date formats. Default = C<dd-mm-yyyy>.

=item -D range

=item --date-col=range

Only convert dates in columns C<range>. Default is everywhere.

Ranges are numeric, where the first column has index C<1>. A range is a
definition of sections joined by C<,>s. A section is either a single column
or a start/finish-pair joined with a dash. A missing finish index on the
last segment is an open range (till the end of the line).

If the range is C<0>, date conversion is disabled.

 -D 0
 -D 1,2,6
 -D 2,4-8,12-

=item -C format

=item --curr-fmt=format

Use C<format> as currency formats for currency <C>, no default.

 -C '$:### ### ##0.00'

=item -f

=item --force

Force usage of the output file if it already exists (unlink before use).

=item -F

=item --formulas

Allow formula's. Otherwise fields starting with an equal sign C<=> are
forced to string type.

There are several shortcuts here to specify different behavior:

=over 2

=item --Fa=a

Define formula action: C<none>/C<die>/C<croak>/C<diag>/C<empty>/C<undef>

=item --Ft

Formula's will be stored as text (formula actions: none)

=item --Fd

Formula's will cause a die

=item --Fc

Formula's will cause a croak

=item --FD

Formula's will cause a warning (this is the default)

=item --Fe

Formula's will be replaced by the empty string

=item --Fu

Formula's will be replaced with an undefined cell

=back

=item -u

=item --utf-8

=item --utf8

CSV is UTF-8 encoded. Likely not needed, as most is auto-detected.

=item --de

=item --fix-utf8

Attempt to fix double-encoded UTF-8. e.g. C<\x{00c3}\x{00ab}> should have
been C<\x0100}>. YMMV.

=item -m

=item --merge

Merge multiple CSV's into a single Excel (separate sheets).

With this option, the option C<-o> is required. All arguments should be
existing files. Piping is not supported.

=item -L N

=item --row-limit=N

Limit export to C<N> rows.

=item -S CxP

=item --split=CxP

When dealing with big CSV datasets, this option enables you to split the
data over several sheets. When all the C<-S> options match in a single row,
that row will be the first row of a new sheet. (see also C<--sl=C>)

=over 2

=item C

The column that should be examined. C<A> = C<1>. If lower case, the value
of that column is matched case insensitive when appropriate.

 -S 7=ab      Column G is literal "ab"
 -S G=ab      Column G is literal "ab"
 -S g=ab      Column G is literal "ab", or "aB", or "Ab", or "AB"

=item x

Defines the operation on the column

=over 2

=item =

Literal match

 -S G=ab      Column G is literal "ab"
 -S g=ab      Column G is literal "ab", or "aB", or "Ab", or "AB"

=item /

Regex match

 -S G/b[a-z]  Column G matches /b[a-z]/
 -S g/b[a-z]  Column G matches /b[a-z]/i

=item u U

Check for defined

 -S Gu        Column G is undefined
 -S GU        Column G is defined

Similar for emptiness

 -S G=        Column G is defined but empty
 -S G/.       Column G is defined and not empty

=item < >

Compare. If the value in the column is defined compare. If the values only
holds digits, do a numeric compare, otherwise do a string compare.

 -S G<42      Column G is defined and less than 42 (numeric)
 -S G<ab      Column G is defined and less than 42 (string)

 -S G>42      Column G is defined and greater than 42 (numeric)
 -S G>ab      Column G is defined and greater than 42 (string)

=back

=item P

pattern or literal string. Quotation might be required differing per OS and
shell-environment.

=back

=item --sl=C

When splitting with C<-S> / C<--split>, if all criteria match and a new
sheet is to be created, use the value in column C<C> of the matching row
as the new sheet label.

=item -v [V]

=item --verbose[=V]

Set verbosity level. Default = 1. No argument will set to 2.

=back

=head1 SEE ALSO

csv2xls - Convert CSV to old Excel

=head1 AUTHOR

H.Merijn Brand

=head1 COPYRIGHT AND LICENSE

 Copyright (C) 2016-2023 H.Merijn Brand.  All rights reserved.

This library is free software;  you can redistribute and/or modify it under
the same terms as Perl itself.

=cut

Generated by dwww version 1.15 on Sun Jun 30 10:42:11 CEST 2024.