dwww Home | Show directory contents | Find package

#!/usr/bin/perl

# csv-check: Check validity of CSV file and report
#          (m)'20 [21 May 2020] Copyright H.M.Brand 2007-2023

# This code requires the defined-or feature and PerlIO

use 5.012000;
use warnings;

use Data::Peek;
use Encode qw( decode encode );

our $VERSION = "2.05";  # 2020-05-21
my $cmd = $0; $cmd =~ s{.*/}{};

sub usage {
    my $err = shift and select STDERR;
    print <<"EOU";
usage: $cmd [-s <sep>] [-q <quot>] [-e <esc>] [-u] [--pp] [file.csv]
    -s S  --sep=S         use S as seperator char. Auto-detect, default = ','
                          the string "tab" is allowed.
    -q Q  --quo=Q         use Q as quote     char. Auto-detect, default = '"'
                          the string "undef" will disable quotation.
    -e E  --esc=E         use E as escape    char. Auto-detect, default = '"'
                          the string "undef" will disable escapes.
    -N    --nl            force EOL to \\n
    -C    --cr            force EOL to \\r
    -M    --crnl          force EOL to \\r\\n
    -u    --utf-8         check if all fields are valid unicode
    -E E  --enc=E         open file with encoding E
    -h    --hdr           check with header (implies BOM)
    -b    --bom           check with BOM (no header)
    -f    --skip-formula  do not check formula's

          --pp            use Text::CSV_PP instead (cross-check)

    -A a  --attr=at:val   pass attributes to parser
                            --at=val is also supported for know attributes
    -L    --list-attr     list supported CSV attributes
    -X    --list-changes  list attributes that changed from default
EOU
    exit $err;
    } # usage

use Getopt::Long qw(:config bundling passthrough);
my $eol;
GetOptions (
    "help|?"            => sub { usage (0); },
    "V|version"         => sub { say "$cmd [$VERSION]"; exit 0; },

    "c|s|sep=s"         => \(my  $sep   = ""),
    "q|quo|quote=s"     => \(my  $quo   = '"'),
    "e|esc|escape=s"    => \(my  $esc   = '"'),
    "N|nl!"             => sub { $eol   = "\n"; },
    "C|cr!"             => sub { $eol   = "\r"; },
    "M|crnl!"           => sub { $eol   = "\r\n"; },
    "B|binary!"         => \(my  $bin   = 1),

    "u|utf|utf8|utf-8!" => \(my  $opt_u = 0),
    "E|enc|encoding=s"  => \(my  $enc),
    "h|hdr|header!"     => \(my  $opt_h = 0),
    "b|bom!"            => \(my  $opt_b = 0),
    "f|skip-formula!"   => \(my  $opt_f = 0),

    "A|attr=s"          => \ my  @opt_A,
    "L|list-attr!"      => \ my  $opt_L,
    "X|list-changes!"   => \ my  $opt_X,

    "pp!"               => \(my  $opt_p = 0),

    "v|verbose:1"       => \(my  $opt_v = 0),
    ) or usage (1);
$opt_X and $opt_L++;

my  $csvmod = "Text::CSV_XS";
if ($opt_p) {
    require Text::CSV_PP;
    $csvmod = "Text::CSV_PP";
    }
else {
    require Text::CSV_XS;
    }
$csvmod->import ();

binmode STDOUT, ":encoding(utf-8)";
binmode STDERR, ":encoding(utf-8)";

my $fn   = $ARGV[0] // "-";
my @warn;

my %csvarg = (
    sep_char       => $sep eq "tab"   ? "\t"  : $sep,
    quote_char     => $quo eq "undef" ? undef : $quo,
    escape_char    => $esc eq "undef" ? undef : $esc,
    eol            => $eol,
    binary         => $bin,
    keep_meta_info => 1,
    auto_diag      => 1,
    formula        => $opt_f ? "none" : "diag",
    );
{   my $p = $csvmod->new;
    my %ka = map { $_ => $p->{$_} } grep m/^[a-z]/ => $p->known_attributes;
    foreach my $i (reverse 0 .. $#ARGV) {
        if ($ARGV[$i] =~ m/^--(no[-_])?+([-\w]+)(?:=(.*))?$/) {
            my ($attr, $val) = (lc $2 =~ tr/-/_/r, $3 // ($1 ? 0 : 1));
            if (exists $ka{$attr}) {
                unshift @opt_A, "$attr:$val";
                splice @ARGV, $i, 1;
                }
            }
        }
    for (@opt_A) {
        m/^([-\w]+)(?:[:=](.*))?/ or next;
        my ($attr, $val) = (lc $1 =~ tr/-/_/r, $2 // 1);
        exists $ka{$attr} or next;
        $val eq "undef" || !length $val and $val = undef;       # -A escape_char:
        $csvarg{$attr} = $val;
        }
    if ($opt_L) {
        $csvarg{sep_char} ||= $ka{sep_char};
        foreach my $attr (sort keys %ka) {
            $ka{$attr}     //= "(undef)";
            $csvarg{$attr} //= $ka{$attr};
            $opt_X and $csvarg{$attr} eq $ka{$attr} and next;
            printf "  %-21s : %s\n", $attr, $csvarg{$attr};
            }
        exit 0;
        }
    }
$opt_v > 1 and DDumper \%csvarg;

my $data = do { local $/; <> } or die "No data to analyze\n";

my ($rows, %cols, $s_eol) = (0);
unless ($sep) { # No sep char passed, try to auto-detect;
    my ($first_line) = ($data =~ m/\A(.*?)(?:\r\n|\n|\r)/);
    $first_line ||= $data; # if no EOL at all, use whole set
    $sep = $first_line =~ m/["\d],["\d,]/ ? ","  :
           $first_line =~ m/["\d];["\d;]/ ? ";"  :
           $first_line =~ m/["\d]\t["\d]/ ? "\t" :
           # If neither, then for unquoted strings
           $first_line =~ m/\w,[\w,]/     ? ","  :
           $first_line =~ m/\w;[\w;]/     ? ";"  :
           $first_line =~ m/\w\t[\w]/     ? "\t" : ",";
    $data =~ m/([\r\n]+)\Z/ and $s_eol = DDisplay "$1";
    $csvarg{sep_char} = $sep;
    }

my $csv = $csvmod->new (\%csvarg);
$opt_v > 8 and DDumper $csv;

$bin = 0; # Assume ASCII only

sub done {
    my $file = $ARGV // "STDIN";
    (my $pv = "$]0000000") =~ s{^([0-9]+)\.([0-9]{3})([0-9]{3})[0-9]*}
                               {sprintf "%d.%d.%d",$1,$2,$3}e;
    my $uv = eval {
        no warnings;
        (my $cv = $]) =~ s/0+$//;
        eval { require Unicode::UCD;     Unicode::UCD::UnicodeVersion () } ||
        eval { require Module::CoreList; $Module::CoreList::version{$cv}{Unicode} };
        } || "unknown";
    say "Checked $file with $cmd $VERSION\nusing $csvmod @{[$csvmod->VERSION]} with perl $pv and Unicode $uv";
    my @diag = $csv->error_diag;
    my $line = $. // $csv->record_number // "?";
    if ($diag[0] == 2012 && $csv->eof) {
        my @coll = sort { $a <=> $b } keys %cols;
        local $" = ", ";
        my $cols = @coll == 1 ? $coll[0] : "(@coll)";
        $s_eol //= $csv->eol || "--unknown--";
        $s_eol =~ m/[\x00-\x1f]/ and $s_eol = DDisplay $s_eol;
        say "OK: rows: $rows, columns: $cols";
        say "    sep = <$sep>, quo = <$quo>, bin = <$bin>, eol = <$s_eol>";
        say "    encoding = $csv->{ENCODING}" if $csv->{ENCODING};
        if (@coll > 1) {
            say "multiple column lengths:";
            printf " %6d line%s with %4d field%s\n",
                $cols{$_}, $cols{$_} == 1 ? " " : "s",
                $_,        $_        == 1 ? ""  : "s"
                    for @coll;
            }
        $diag[0] = 0;
        }
    elsif ($diag[2]) {
        say "$ARGV record $diag[3] at line $line/$diag[2] - $diag[0] - $diag[1]";
        my $ep  = $diag[2] - 1; # diag[2] is 1-based
        my $ei  = $csv->error_input;
        if (defined $ei) {
            my $l = 0;
            my $s = "";
            eval { my $u = decode ("utf-8", $ei); $ei = $u };
            for (split m/([^ -~])/ => $ei) {
                if (m/^[ -~]+$/) {
                    $s .= $_;
                    $l += length;
                    next;
                    }
                if ($_ eq "\t") {
                    $s .= "\\t";
                    $ep > $l and $ep++;
                    $l += 2;
                    next;
                    }
                if ($_ eq "\n") {
                    $s .= "\\n";
                    $ep > $l and $ep++;
                    $l += 2;
                    next;
                    }
                if ($_ eq "\r") {
                    $s .= "\\r";
                    $ep > $l and $ep++;
                    $l += 2;
                    next;
                    }
                $s .= sprintf "\\x{%05x}", ord;
                $ep > $l and $ep += 9 - length encode "utf-8", $_;
                $l += 9;
                }

            say "    |$s|"; #           2b06
            say "    |", " " x $ep, "\x{25b2}", " " x (length ($s) - $ep - 1), "|";
            }
        }
    else {
        say "$ARGV line $line - $diag[1]";
        }
    print for @warn;
    exit $diag[0];
    } # done

sub show {
    say STDERR join ", " => map { "\x{231e}$_\x{231d}" } @_;
    } # show

sub stats {
    my $r = shift;
    $cols{scalar @$r}++;
    grep { $_ & 0x0002 } $csv->meta_info and $bin = 1;
    $opt_v > 2 and show (@$r);
    if ($opt_u) {
        my @r = @$r;
        foreach my $x (0 .. $#r) {
            utf8::is_utf8 ($r[$x]) and next;

            local $SIG{__WARN__} = sub {
                (my $msg = shift) =~ s{ at /\S+Encode.pm.*}{};
                my @h = $csv->column_names;
                push @warn, sprintf "Field %d%s in record %d - '%s'\t- %s",
                    $x + 1, @h ? " (column: '$h[$x]')" : "", $rows,
                    DPeek ($r[$x]), $msg;
                };
            my $oct = decode ("utf-8", $r[$x], Encode::FB_WARN);
            }
        }
    } # stats

my $mode = $enc ? "<:encoding($enc)" : "<";
open my $fh, $mode, \$data or die "$fn: $!\n";
if ($opt_h) {
    $csv->header ($fh);
    }
elsif ($opt_b) {
    my @hdr = $csv->header ($fh, { detect_bom => 1, set_column_names => 0 });
    stats \@hdr;
    }

local $SIG{__WARN__} = sub { push @warn, @_; };
while (my $row = $csv->getline ($fh)) {
    $rows++;
    stats $row;
    }
done;

Generated by dwww version 1.15 on Sun Jun 30 10:48:44 CEST 2024.