If you want to have a look at what the code actually does to dig out flaws, here is the current version:
sub sanitize_html($%)
{
my ($html,%tags)=@_;
my (@stack,$clean);
my $entity_re=qr/&(?!\#[0-9]+;|\#x[0-9a-fA-F]+;|amp;)/;
while($html=~/(?:([^<]+)|<([^<>]*)>?)/g)
{
my ($text,$tag)=($1,$2);
if($text)
{
$text=~s/$entity_re/&/g;
$text=~s/>/>/g;
$clean.=$text;
}
else
{
if($tag=~m!^\s*(/?)\s*([a-z0-9_:\-\.]+)(?:\s+(.*?)|)\s*(/?)\s*$!si)
{
my ($closing,$name,$args,$implicit)=($1,lc($2),$3,$4);
if($tags{$name})
{
if($closing)
{
if(grep { $_ eq $name } @stack)
{
my $entry;
do {
$entry=pop @stack;
$clean.="</$entry>";
} until $entry eq $name;
}
}
else
{
my %args;
$args=~s/\s/ /sg;
while($args=~/([a-z0-9_:\-\.]+)(?:\s*=\s*(?:'([^']*?)'|"([^"]*?)"|['"]?([^'" ]*))|)/gi)
{
my ($arg,$value)=(lc($1),defined($2)?$2:defined($3)?$3:$4);
$value=$arg unless defined($value);
my $type=$tags{$name}{args}{$arg};
if($type)
{
my $passes=1;
if($type=~/url/i) { $passes=0 unless $value=~/(?:^$protocol_re:|^[^:]+$)/ }
if($type=~/number/i) { $passes=0 unless $value=~/^[0-9]+$/ }
if($passes)
{
$value=~s/$entity_re/&/g;
if($value=~/"/) { $value="'$value'" }
else { $value="\"$value\"" }
$args{$arg}=$value;
}
}
}
my $cleanargs=join " ",map { "$_=$args{$_}" } keys %args;
$implicit="/" if($tags{$name}{empty});
push @stack,$name unless $implicit;
$clean.="<$name";
$clean.=" $cleanargs" if $cleanargs;
$clean.=" $implicit" if $implicit;
$clean.=">";
}
}
}
}
}
my $entry;
while($entry=pop @stack) { $clean.="</$entry>" }
return $clean;
}