dwww Home | Show directory contents | Find package

#!/usr/bin/perl

# This script cleans up an HTML document

use strict;
use warnings;
use HTML::Parser ();

# Configuration -- edit these lists to change what gets stripped.

# Attribute names that are deleted from every start tag (presentational
# attributes and inline event handlers).  Compared case-insensitively.
my @ignore_attr = qw(
    bgcolor background color face style link alink vlink text
    onblur onchange onclick ondblclick onfocus onkeydown onkeyup onload
    onmousedown onmousemove onmouseout onmouseover onmouseup
    onreset onselect onunload
);

# Tag names passed to the parser's ignore_tags option.
my @ignore_tags = qw(font big small b i);

# Element names passed to the parser's ignore_elements option.
my @ignore_elements = qw(script style);

# Set-style lookup table: attribute name => 1, so a membership test is a
# single hash access instead of a scan of @ignore_attr.
my %ignore_attr;
@ignore_attr{@ignore_attr} = (1) x @ignore_attr;

# Start-tag handler: print the tag with every attribute named in
# %ignore_attr cut out of its source text.
#
#   $pos  - array ref produced by HTML::Parser's "tokenpos" argspec.  This
#           code consumes it from the end in groups of four:
#           (key offset, key length, value offset, value length), one group
#           per attribute, all relative to $text.  A false value offset is
#           treated as "attribute has no value".
#   $text - original source text of the start tag.
#
# The (possibly edited) tag text is written to the currently selected
# output handle.  Note that @$pos is consumed (spliced) in the process.
sub tag {
    my ($pos, $text) = @_;
    if (@$pos >= 4) {

        # kill some attributes
        my ($k_offset, $k_len, $v_offset, $v_len) = @{$pos}[-4 .. -1];

        # $next_attr is the offset where the text following the current
        # attribute starts.  For the last attribute that is the end of its
        # value, or the end of its key when it has no value.
        my $next_attr = $v_offset ? $v_offset + $v_len : $k_offset + $k_len;
        my $edited;

        # Work from the last attribute towards the first, so that cutting
        # text never invalidates the offsets still waiting to be processed.
        while (@$pos >= 4) {
            ($k_offset, $k_len, $v_offset, $v_len) = splice @$pos, -4;
            if ($ignore_attr{lc substr($text, $k_offset, $k_len)}) {

                # Remove the key, its value and the whitespace up to the
                # following attribute in one go.
                substr($text, $k_offset, $next_attr - $k_offset) = "";
                $edited++;
            }
            $next_attr = $k_offset;
        }

        # If we killed the trailing attribute(s), the whitespace that used
        # to separate them from their predecessor is left dangling in front
        # of the closing ">"; remove it.  Anchoring on the end of the tag
        # (rather than on "<name") also covers tag names containing
        # characters outside \w, such as "o:p" or "my-tag".
        $text =~ s/\s+>\z/>/ if $edited;
    }
    print $text;
}

# Declaration handler: pass DOCTYPE declarations through unchanged and
# drop every other kind of declaration.
#
#   $kind   - declaration name as supplied by the "tagname" argspec.
#   $markup - original source text of the declaration.
sub decl {
    my ($kind, $markup) = @_;
    if ($kind eq "doctype") {
        print $markup;
    }
}

# Default handler: echo the original source text of the event verbatim.
#
#   $chunk - source text supplied by the "text" argspec.
sub text {
    my ($chunk) = @_;
    print $chunk;
}

# The file to clean is the first command-line argument.  Fail early with a
# usage message instead of handing an undefined file name to the parser.
my $file = shift;
die "Usage: $0 <file>\n" unless defined $file;

# Build the parser and run it over the file, writing the cleaned document
# to standard output:
#   - start tags go through tag(), which strips unwanted attributes;
#   - processing instructions and comments get an empty-string handler,
#     which (per the HTML::Parser documentation) makes the parser ignore
#     them without falling back to the default handler;
#   - declarations go through decl(), which keeps only the DOCTYPE;
#   - everything else is echoed unchanged by text().
# ignore_tags / ignore_elements are parser options fed from the
# configuration lists; see the HTML::Parser documentation for their exact
# semantics.
HTML::Parser->new(
    api_version   => 3,
    start_h       => [\&tag, "tokenpos, text"],
    process_h     => ["", ""],
    comment_h     => ["", ""],
    declaration_h => [\&decl, "tagname, text"],
    default_h     => [\&text, "text"],

    ignore_tags     => \@ignore_tags,
    ignore_elements => \@ignore_elements,
)->parse_file($file)
    || die "Can't open file: $!\n";

Generated by dwww version 1.15 on Sun Jun 30 10:46:11 CEST 2024.