#!/usr/bin/perl # csv2xlsx: Convert csv to xlsx # (m)'22 Copyright H.M.Brand 2007-2023 use 5.014000; use warnings; our $VERSION = "1.20 - 2022-06-21"; sub usage { my $err = shift and select STDERR; print <<"EOU"; usage: csv2xlsx [options] [-o <xlsx>] [file.csv] csv2xlsx --help | --man | --info -s <sep> use <sep> as separator char, auto-detect, default = ',' The string "tab" is allowed. -e <esc> use <esc> as escape char, auto-detect, default = '"' The string "undef" is allowed. -q <quot> use <quot> as quotation char, default = '"' The string "undef" will disable quotation. -w <width> use <width> as default minimum column width default = 4 -o <xlsx> write output to file named <xlsx>, defaults to input file name with .csv replaced with .xlsx if from standard input, defaults to csv2xlsx.xlsx -F allow formula's. Otherwise fields starting with an equal sign are forced to string --Fa=aaa Define formula action: none/die/croak/diag/empty/undef --Ft Formula's will be stored as text (formula actions: none) --Fd Formula's will cause a die --Fc Formula's will cause a croak --FD Formula's will cause a warning (this is the default) --Fe Formula's will be replaced by the empty string --Fu Formula's will be replaced with an undefined cell -f force usage of <xlsx> if already exists (unlink before use) -d <dtfmt> use <dtfmt> as date formats. Default = 'dd-mm-yyyy' -C <C:fmt> use <fmt> as currency formats for currency <C>, no default -D cols only convert dates in columns <cols>. Default is everywhere. -D0 is disable -L N limit export to N rows -u CSV is UTF8 --de Some CSV fields might be double-encoded. Try to fix that. -m merge multiple CSV's into a single xlsx (separate sheets) -o is required, all arguments should be existing files -S <cp> Split CSV on COLUMNxPAT into separate sheets. See --man or --info for options/features and examples. May repeat. --sl=C Use column C as sheet label when splitting with -S -v [<lvl>] verbosity (default = 1) EOU exit $err; } # usage use Text::CSV_XS; use Excel::Writer::XLSX; use List::Util qw( first ); use Date::Calc qw( Delta_Days Days_in_Month ); use Encode qw( from_to ); use Getopt::Long qw(:config bundling passthrough ); my $quo = '"'; my $esc = '"'; my $wdt = 4; # Default minimal column width my $fac = "diag"; # Formula action (default is warn only) my $dtf = "dd-mm-yyyy"; # Date format my $crf = ""; # Currency format, e.g.: $:### ### ##0.00 my $opt_v = 1; my $dtc; GetOptions ( "V|version" => sub { say $0 =~ s{.*/}{}r, " [$VERSION]"; exit 0; }, "help|?" => sub { usage (0); }, "man" => sub { pod_nroff (); }, "info" => sub { pod_text (); }, "c|s|sep=s" => \my $sep, # Set after reading first line in attempt to auto-detect "q|quo=s" => \$quo, "e|esc=s" => \$esc, "w|width=i" => \$wdt, "o|x|out=s" => \my $xls, "d|date-fmt=s" => \$dtf, "D|date-col=s" => \$dtc, "C|curr-fmt=s" => \$crf, "f|force!" => \my $frc, "F|formulas!" => \my $frm, "Fa=s" => \$fac, "Ft" => sub { $fac = "none"; }, "Fd" => sub { $fac = "die"; }, "Fc" => sub { $fac = "croak"; }, "FD" => sub { $fac = "diag"; }, "Fe" => sub { $fac = "empty"; }, "Fu" => sub { $fac = "undef"; }, "u|utf-8|utf8!" => \my $utf, "de|fix-utf8!" => \my $dutf, # double encoded? \x{c3}\x{ab} => \x{100} "m|merge!" => \my $mrg, "S|split=s" => \my @split, "sl|split-label=s"=> \my $split_sl, "L|row-limit=i" => \my $row_limit, "v|verbose:2" => \$opt_v, ) or usage (1); sub pod_text { require Pod::Text::Color; my $m = $ENV{NO_COLOR} ? "Pod::Text" : "Pod::Text::Color"; my $p = $m->new (); open my $fh, ">", \my $out or die "Cannot generate manual: $!\n"; $p->parse_from_file ($0, $fh); close $fh; print $out; exit 0; } # pod_text sub pod_nroff { first { -x "$_/nroff" } grep { -d } split m/:+/ => $ENV{PATH} or pod_text (); require Pod::Man; my $p = Pod::Man->new (); open my $fh, "|-", "nroff", "-man" or die "Cannot generate manual: $!\n"; $p->parse_from_file ($0, $fh); close $fh; exit 0; } # pod_nroff if ($mrg) { my @csv; for (@ARGV) { if (m/\.xlsx?$/i) { $xls and usage (1); $xls = $_; next; } if (m/\.(?:csv|png|jpe?g|bmp|gif|tiff|xpm)$/i && -s) { push @csv => $_; next; } warn "Argument $_ is not an existing (CSV) file\n"; usage (1); } $xls && @csv or usage (1); @ARGV = @csv; } sub col2col { my $l = shift; $l =~ m/^[0-9]/ and return $l; my $c = 0; while ($l =~ s/^([A-Za-z])//) { $c = 26 * $c + 1 + ord (uc $1) - ord ("A"); } $c; } # col2col foreach my $split (@split) { my ($col, $operator, $pat) = ($split =~ m{^ ([0-9]+|[A-Z]+|[a-z]+) # Column: A, AB, 1, 14 ([=/uU<>]) # Operator (.*) # Pattern/string \z}x) or usage (1); my $case = $col =~ m/^[a-z]/ ? 1 : 0; $split = { col => col2col ($col), op => $operator, str => $pat, ic => $case, lbl => undef, }; if ($split_sl) { ($col) = ($split_sl =~ m/^([0-9]+|[A-Z]+)$/) or usage (1); $split->{lbl} = col2col ($col); } } my $base = @ARGV && -f $ARGV[0] ? $ARGV[0] : "csv2xlsx"; $xls ||= $base =~ s/(?:\.csv)?$/.xlsx/ir; -s $xls && $frc and unlink $xls; if (-s $xls) { print STDERR "File '$xls' already exists. Overwrite? [y/N] > N\b"; scalar <STDIN> =~ m/^[yj](?:es|a)?$/i or exit; } # Test if the file can be created # The error from Excel::Writer::XLSX is not very informative if it fails # e.g. when the folder cannot be used for whatever reason { open my $fh, ">", $xls or die "$xls: $!\n"; close $fh; unlink $xls; } if ($dutf) { eval { require Encode::DoubleEncodedUTF8; }; if ($@) { $dutf = 0; warn "Cannot load Encode::DoubleEncodedUTF8; --de ignored\n"; } } my $wbk = Excel::Writer::XLSX->new ($xls); $dtf =~ s/j/y/g; my %fmt = ( date => $wbk->add_format (align => "center", num_format => $dtf), rest => $wbk->add_format (align => "left"), wrap => $wbk->add_format (text_wrap => 1), ); $crf =~ s/^([^:]+):(.*)/$1/ and $fmt{currency} = $wbk->add_format ( num_format => "$1 $2", align => "right", ); my @args = @ARGV ? @ARGV : (""); foreach my $csvf (@args) { my $sheetname = $csvf =~ s{\.\w+$}{}ir =~ s{.*/}{}r || "Sheet 1"; ($_ = length $sheetname) > 31 and substr $sheetname, 31, $_ - 31, ""; my ($wks, $w, $h); $opt_v > 7 and warn "Parsing $csvf into $xls.$sheetname ...\n"; if ($csvf =~ m/\.(png|jpe?g|bmp|gif|tiff|xpm)$/i) { $wks = $wbk->add_worksheet ($sheetname); $wks->insert_image (1, 1, $csvf); next; } my $row; my $firstline; my $fh; if (-f $csvf) { $opt_v and say "Reading $csvf"; open $fh, "<", $csvf or die "$csvf: $!\n"; } else { $opt_v and say "Reading STDIN"; $fh = *ARGV; } my $Sep = $sep; unless ($Sep) { # No sep char passed, try to auto-detect; while (<$fh>) { m/\S/ or next; # Skip empty leading blank lines $Sep = # start auto-detect with quoted strings m/["\d];["\d;]/ ? ";" : m/["\d],["\d,]/ ? "," : m/["\d]\t["\d,]/ ? "\t" : # If neither, then for unquoted strings m/\w;[\w;]/ ? ";" : m/\w,[\w,]/ ? "," : m/\w\t[\w,]/ ? "\t" : # And pipes (lowest prio) m/["\d]\|["\d,]/ ? "|" : m/\w\|[\w,]/ ? "|" : m/,/ && !m/[;\t]/ ? "," : m/;/ && !m/[,\t]/ ? ";" : m/\t/ && !m/[,;]/ ? "\t" : "," ; $firstline = $_; last; } $firstline or die "The sourcefile does not contain any usable data\n"; } my $csv = Text::CSV_XS-> new ({ sep_char => $Sep eq "tab" ? "\t" : $Sep, quote_char => $quo eq "undef" ? undef : $quo, escape_char => $esc eq "undef" ? undef : $esc, binary => 1, keep_meta_info => 1, auto_diag => 1, formula => $fac, }); if ($firstline) { $csv->parse ($firstline) or die $csv->error_diag (); $row = [ $csv->fields ]; } if ($opt_v > 3) { foreach my $k (qw( sep_char quote_char escape_char )) { my $c = $csv->$k () || "undef"; $c =~ s/\t/\\t/g; $c =~ s/\r/\\r/g; $c =~ s/\n/\\n/g; $c =~ s/\0/\\0/g; $c =~ s/([\x00-\x1f\x7f-\xff])/sprintf"\\x{%02x}",ord$1/ge; printf STDERR "%-11s = %s\n", $k, $c; } } if (length $dtc) { if ($dtc eq "0") { $dtc = { -2 => 0 }; } else { my $rows = $dtc; $rows =~ s/-$/-999/; # 3,6- $rows =~ s/-/../g; eval "\$dtc = { map { \$_ => 1 } $rows }"; } } my @w; # data height, -width, and default column widths while ($row && @$row or $row = $csv->getline ($fh)) { $row_limit and $csv->record_number > $row_limit and last; my @row = @$row; $opt_v > 8 and warn "@row\n"; if (@split) { my $ns = 0; foreach my $split (@split) { my ($i, $op, $str, $case) = @{$split}{qw( col op str ic )}; my $v = $row[$i - 1]; # Column-index is one-based if (defined $v) { $op eq "U" and $ns += 1; if ($case) { $op eq "=" and $ns += ( lc $v eq lc $v ); $op eq "/" and $ns += ( $v =~ m/$str/i ); } else { $op eq "=" and $ns += ( $v eq $str ); $op eq "/" and $ns += ( $v =~ m/$str/ ); if ($v =~ m/^[0-9]+$/) { $op eq "<" and $ns += ( $v < $str ); $op eq ">" and $ns += ( $v > $str ); } else { $op eq "<" and $ns += ( $v lt $str ); $op eq ">" and $ns += ( $v gt $str ); } } } else { $op eq "u" and $ns += 1; } } $opt_v > 8 and warn join " " => "Record", $csv->record_number, "matched", $ns, "out of", scalar @split, "criteria\n"; if ($ns == @split) { # All criteria PASS if (@w) { $wks->set_column ($_, $_, $w[$_]) for 0 .. $#w; $wks = undef; } $split[0]{lbl} and $sheetname = $row[$split[0]{lbl} - 1]; } } unless ($wks) { $wks = $wbk->add_worksheet ($sheetname); $utf && !$wks->can ("write_unicode") and $utf = 0; ($h, $w, @w) = (0, 1); } @row > $w and push @w => ($wdt) x (($w = @row) - @w); foreach my $c (0 .. $#row) { my $val = $row[$c] // ""; my $l = length $val; $l > ($w[$c] // -1) and $w[$c] = $l; $dutf and $csv->is_binary ($c) and utf8::valid ($val) and $val = Encode::decode ("utf-8-de", $val); if ($utf and $csv->is_binary ($c)) { from_to ($val, "utf-8", "ucs2"); $wks->write_unicode ($h, $c, $val); next; } if ($csv->is_quoted ($c)) { $val =~ s/\r\n/\n/g; if ($utf) { from_to ($val, "utf-8", "ucs2"); $val =~ m/\n/ ? $wks->write_unicode ($h, $c, $val, $fmt{wrap}) : $wks->write_unicode ($h, $c, $val); } else { $val =~ m/\n/ ? $wks->write_string ($h, $c, $val, $fmt{wrap}) : $wks->write_string ($h, $c, $val); } next; } if (!$dtc or $dtc->{$c + 1}) { my @d = (0, 0, 0); # Y, M, D $val =~ m/^(\d{4})(\d{2})(\d{2})$/ and @d = ($1, $2, $3); $val =~ m/^(\d{2})-(\d{2})-(\d{4})$/ and @d = ($3, $2, $1); if ( $d[2] >= 1 && $d[2] <= 31 && $d[1] >= 1 && $d[1] <= 12 && $d[0] >= 1900 && $d[0] <= 2199) { my $dm = Days_in_Month (@d[0,1]); $d[2] < 1 and $d[2] = 1; $d[2] > $dm and $d[2] = $dm; my $dt = 2 + Delta_Days (1900, 1, 1, @d); $wks->write ($h, $c, $dt, $fmt{date}); next; } } if ($crf and $val =~ m/^\s*\Q$crf\E\s*([0-9.]+)$/) { $wks->write ($h, $c, $1 + 0, $fmt{currency}); next; } if (!$frm && $val =~ m/^=/) { $wks->write_string ($h, $c, $val); } else { $wks->write ($h, $c, $val); } } ++$h % 100 or $opt_v && printf STDERR "%6d x %6d\r", $w, $h; } continue { $row = undef } close $fh; $opt_v && printf STDERR "%6d x %6d\n", $w, $h; $wks->set_column ($_, $_, $w[$_]) for 0 .. $#w; } $opt_v and say "Writing $xls"; $wbk->close (); __END__ =head1 NAME csv2xlsx - Convert CSV to Excel 2007+ =head1 SYNOPSIS csv2xlsx [options] [file.csv] csv2xlsx test.csv csv2xlsx -f -o merged.xlsx -m foobar*.csv =head1 DESCRIPTION This tool converts CSV data to Excel-2007+. It can convert the CSV into a single sheet, merge CSV files into multiple sheets in a workbook or split a single CSV into multiple sheets. The tool supports encoding, formula handle, date conversion and some more. =head1 OPTIONS =over 2 =item -s S =item --sep=S use C<S> as separator character, auto-detect, default = C<,>. The literal string C<tab> is allowed. =item -q Q =item --quo=Q use C<Q> as quotation character, auto-detect, default = C<">. The literal string C<undef> is allowed to disable quotation. =item -e E =item --esc=E use C<E> as escape character, auto-detect, default = C<">. The literal string C<undef> is allowed to disable escapes. =item -w W =item --width=W Set column width. Default is to auto-size per column per sheet with a minimum width of C<4>. =item -o FILE.xlsx =item -x FILE.xlsx =item --out=FILE.xlsx Specify the output filename. Default it the same name as the input-file where the trailing C<.csv> will be replaced with C<.xlsx>. If a filename can not be automatically generated, it will default to C<csv2xlsx.xlsx>. =item -d format =item --date-fmt=format Use C<format> as date formats. Default = C<dd-mm-yyyy>. =item -D range =item --date-col=range Only convert dates in columns C<range>. Default is everywhere. Ranges are numeric, where the first column has index C<1>. A range is a definition of sections joined by C<,>s. A section is either a single column or a start/finish-pair joined with a dash. A missing finish index on the last segment is an open range (till the end of the line). If the range is C<0>, date conversion is disabled. -D 0 -D 1,2,6 -D 2,4-8,12- =item -C format =item --curr-fmt=format Use C<format> as currency formats for currency <C>, no default. -C '$:### ### ##0.00' =item -f =item --force Force usage of the output file if it already exists (unlink before use). =item -F =item --formulas Allow formula's. Otherwise fields starting with an equal sign C<=> are forced to string type. There are several shortcuts here to specify different behavior: =over 2 =item --Fa=a Define formula action: C<none>/C<die>/C<croak>/C<diag>/C<empty>/C<undef> =item --Ft Formula's will be stored as text (formula actions: none) =item --Fd Formula's will cause a die =item --Fc Formula's will cause a croak =item --FD Formula's will cause a warning (this is the default) =item --Fe Formula's will be replaced by the empty string =item --Fu Formula's will be replaced with an undefined cell =back =item -u =item --utf-8 =item --utf8 CSV is UTF-8 encoded. Likely not needed, as most is auto-detected. =item --de =item --fix-utf8 Attempt to fix double-encoded UTF-8. e.g. C<\x{00c3}\x{00ab}> should have been C<\x0100}>. YMMV. =item -m =item --merge Merge multiple CSV's into a single Excel (separate sheets). With this option, the option C<-o> is required. All arguments should be existing files. Piping is not supported. =item -L N =item --row-limit=N Limit export to C<N> rows. =item -S CxP =item --split=CxP When dealing with big CSV datasets, this option enables you to split the data over several sheets. When all the C<-S> options match in a single row, that row will be the first row of a new sheet. (see also C<--sl=C>) =over 2 =item C The column that should be examined. C<A> = C<1>. If lower case, the value of that column is matched case insensitive when appropriate. -S 7=ab Column G is literal "ab" -S G=ab Column G is literal "ab" -S g=ab Column G is literal "ab", or "aB", or "Ab", or "AB" =item x Defines the operation on the column =over 2 =item = Literal match -S G=ab Column G is literal "ab" -S g=ab Column G is literal "ab", or "aB", or "Ab", or "AB" =item / Regex match -S G/b[a-z] Column G matches /b[a-z]/ -S g/b[a-z] Column G matches /b[a-z]/i =item u U Check for defined -S Gu Column G is undefined -S GU Column G is defined Similar for emptiness -S G= Column G is defined but empty -S G/. Column G is defined and not empty =item < > Compare. If the value in the column is defined compare. If the values only holds digits, do a numeric compare, otherwise do a string compare. -S G<42 Column G is defined and less than 42 (numeric) -S G<ab Column G is defined and less than 42 (string) -S G>42 Column G is defined and greater than 42 (numeric) -S G>ab Column G is defined and greater than 42 (string) =back =item P pattern or literal string. Quotation might be required differing per OS and shell-environment. =back =item --sl=C When splitting with C<-S> / C<--split>, if all criteria match and a new sheet is to be created, use the value in column C<C> of the matching row as the new sheet label. =item -v [V] =item --verbose[=V] Set verbosity level. Default = 1. No argument will set to 2. =back =head1 SEE ALSO csv2xls - Convert CSV to old Excel =head1 AUTHOR H.Merijn Brand =head1 COPYRIGHT AND LICENSE Copyright (C) 2016-2023 H.Merijn Brand. All rights reserved. This library is free software; you can redistribute and/or modify it under the same terms as Perl itself. =cut
Generated by dwww version 1.15 on Sun Jun 30 10:42:11 CEST 2024.