package tasks::ReplaceExternalLinks2;


=for warning
Due to breaking changes in AnomieBOT::API, this task will probably not run
anymore. If you really must run it, try getting a version from before those
changes.

=begin metadata

Bot:     AnomieBOT
Task:    ReplaceExternalLinks2
BRFA:    Wikipedia:Bots/Requests for approval/AnomieBOT 44
Status:  Completed 2012-04-04
Created: 2010-09-20

Process pages with or links to revert
spam, add archiveurl for geocities cites on or,
change archived geocities links to or, and tag
unarchived geocities links with {{tl|dead link}}.

=end metadata


use utf8;
use strict;

use Data::Dumper;
use POSIX;
use Date::Parse;
use LWP::UserAgent;
use XML::LibXML;
use HTML::Entities ();
use URI;
use AnomieBOT::Task qw/:time/;
use vars qw/@ISA/;

# Marker to indicate where {{dead links}} should be removed
my $rmdl="\x02*\x03";

# Constructor: builds the object via the parent class's new(), blesses it
# into $class and returns it.
# NOTE(review): the statement that the `agent=>` line belongs to, and this
# sub's closing brace, are not present in this view of the file -- confirm
# against the complete source before changing anything here.
sub new {
    my $class=shift;
    my $self=$class->SUPER::new();
    # NOTE(review): orphaned argument line; presumably part of the
    # construction of the user agent later read as $self->{'ua'} -- TODO confirm.
        agent=>"AnomieBOT link checker for (",
    # Unfortunately, webcite seems to like quoting back the url without
    # encoding ampersands in certain error messages.
    bless $self, $class;
    return $self;


=for info
Approved 2010-10-07.<br />[[Wikipedia:Bots/Requests for approval/AnomieBOT 44]]


# Approval indicator for this task: unconditionally returns -1.
# NOTE(review): presumably a negative value tells the framework the task is
# no longer to be run (the metadata marks it "Completed") -- confirm against
# AnomieBOT::Task. The sub's closing brace is not present in this view.
sub approved {
    return -1;

# Main entry point of the task.
#
# Parameters:
#   $self - the task object (its 'flags' and 'iter' members are used below)
#   $api  - the API object used for all wiki reads, edits and logging
#
# Returns a number on every visible path (0, 60, 300 or 600).
# NOTE(review): presumably the number of seconds the framework should wait
# before calling run() again -- confirm against AnomieBOT::Task.
#
# NOTE(review): many lines of this sub (closing braces/parens, several `if`
# heads, the iterator setup and the unstrip/cleanup statements) are not
# present in this view; the comments below describe only what is visible.
sub run {
    my ($self, $api)=@_;
    my $res;

    $api->task('ReplaceExternalLinks2', 0, 10, qw/d::Redirects d::Templates d::Nowiki/);

    # Text appended to every edit summary, pointing at the shutoff page
    my $screwup='Errors? [[User:'.$api->user.'/shutoff/ReplaceExternalLinks2]]';

    # Spend a max of 5 minutes on this task before restarting
    my $endtime=time()+300;

    # Get list of citation templates
    my %templates=$api->redirects_to_resolved(
        'Template:Citation metadata',
        'Template:Cite api',
        'Template:Cite book',
        'Template:Cite conference',
        'Template:Cite IETF',
        'Template:Cite interview',
        'Template:Cite journal',
        'Template:Cite mailing list',
        'Template:Cite news',
        'Template:Cite press release',
        'Template:Cite video',
        'Template:Cite web',
        'Template:Vancite conference',
        'Template:Vancite journal',
        'Template:Vancite news',
        'Template:Vancite web',
        'Template:Vcite conference',
        'Template:Vcite journal',
        'Template:Vcite news',
        'Template:Vcite web',
        # NOTE(review): the end of the call and the error check guarding the
        # next two lines are missing from this view; the error is read from
        # the entry stored under the empty-string key.
        $api->warn("Failed to get citation template redirects: ".$templates{''}{'error'}."\n");
        return 60;

    # Get regex for finding {{dead link}}
    my %dl=$api->redirects_to_resolved(
        'Template:Dead link',
        # NOTE(review): same missing call end / error check as above.
        $api->warn("Failed to get dead link template redirects: ".$dl{''}{'error'}."\n");
        return 60;
    # Build one alternation out of every key of %dl: each name is
    # regex-quoted, the leading "Template:" is dropped with the first
    # character of the name made case-insensitive, and spaces are allowed to
    # match either a space or an underscore.
    my $dlre='{{(?i:\s*Template\s*:)?\s*(?:'.join('|',map { $_="\Q$_\E"; s/^Template\\:(.)/(?i:$1)/; s/\\ /[ _]/g; $_; } keys %dl).')(?>\s*(?s:\|.*?)?}})';

            # NOTE(review): these are query parameters for an exturlusage
            # list request; the call that creates $self->{'iter'} from them
            # is not present in this view.
            list        => 'exturlusage',
            euprop      => 'title',
            euquery     => ['*','*'],
            eunamespace => '0',
            eulimit     => '1000', # exturlusage has issues with big lists
    # Process each page returned by the iterator
    while(my $pg=$self->{'iter'}->next){
            # NOTE(review): the condition guarding this error path is missing.
            $api->warn("Failed to retrieve page list for ".$self->{'iter'}->iterval.": ".$pg->{'error'}."\n");
            return 60;

        return 0 if $api->halting;
        my $page=$pg->{'title'};
        my $tok=$api->edittoken($page, EditRedir => 1);
        if($tok->{'code'} eq 'shutoff'){
            $api->warn("Task disabled: ".$tok->{'content'}."\n");
            return 300;
        if($tok->{'code'} ne 'success'){
            $api->warn("Failed to get edit token for $page: ".$tok->{'error'}."\n");
            $api->warn("WTF? $page does not exist?\n");

        # Setup flags

        # Current page text comes from the edit token response; $outtxt is
        # the working copy that gets modified.
        my $intxt=$tok->{'revisions'}[0]{'*'};
        my $outtxt=$intxt;

        # Despam
        # NOTE(review): the statements that actually alter $outtxt for this
        # step are not present in this view.
        $self->{'flags'}{'oo'}=1 if $intxt ne $outtxt;

        # Replace the links. First, do citation templates.
        my $nowiki;
        $outtxt=$api->process_templates($outtxt, sub {
            return undef if $self->{'flags'}{'fail'};
            my $name=shift;
            my $params=shift;
            my $wikitext=shift;
            my $data=shift;
            my $oname=shift;

            # Only the citation templates collected in %templates are handled
            return undef unless exists($templates{"Template:$name"});

            my $ret="{{$oname";
            my $archived=0;
            my $url='';
            my ($accessdate,$date,$year,$month);
            # NOTE(review): the bodies of the branches below are missing;
            # presumably they fill $url, $accessdate, $date, $year, $month
            # and $archived from the matching parameters -- TODO confirm.
            foreach ($api->process_paramlist(@$params)){
                if($_->{'name'} eq 'url'){
                } elsif($_->{'name'} eq 'accessdate'){
                } elsif($_->{'name'} eq 'date'){
                } elsif($_->{'name'} eq 'year' && $_->{'value'}=~/^\d+$/){
                } elsif($_->{'name'} eq 'month'){
                } elsif($_->{'name'} eq 'archiveurl'){
            my $r404='';
            if(!$archived && $url=~m!^http://(?:[\w\d-]+\.)*geocities\.com!){
                my ($u,$dt);
                # Reference date: first defined of access date, date,
                # "1 $month $year", "1 June $year", or the current time
                $dt=$accessdate // $date // str2time("1 $month $year") // str2time("1 June $year") // time();
                ($u,$dt,$r404)=chkExtLink($self,$api,0,$url, $dt);
                return undef if($self->{'flags'}{'fail'});
                # Archive parameters are appended only when chkExtLink did
                # not hand back a dead-link tag
                $ret.="|archiveurl=$u|archivedate=$dt" unless $r404;
            return $ret;
        return 60 if($self->{'flags'}{'fail'});

        # Next, strip for raw link processing
        # Regular expressions are adapted from those MediaWiki uses to
        # recognize external links.
        ($outtxt,$nowiki)=$api->strip_templates($outtxt, sub {
            my $name=shift;
            return exists($templates{"Template:$name"});
        }, {}, $nowiki);

        # Strip out ref tags, then replace any links with a guess at access
        # date.
        ($outtxt,$nowiki)=$api->strip_regex(qr!<ref[ >].*?</ref>!, $outtxt, $nowiki);
        # Patterns that mark a link as already having an archive mention
        my @arc=qw/[aA]rchive webcitation\.org [wW]ayback/;
        my $arc='(?:'.join('|',@arc).')';
        while(my ($k,$v)=each %$nowiki){
            # Only stripped <ref> contents without an archive mention
            next unless $v=~/^<ref/;
            next if $v=~/$arc/;
            my ($dt,$nw);

            # We have to re-strip here, because the saved values here are
            # automatically unstripped.
            ($v,$nw)=$api->strip_templates($v, sub {
                my $name=shift;
                return exists($templates{"Template:$name"});
            }, {}, $nw);

            # Use an "accessed ..."/"retrieved ..." date in the ref text, if any
            $dt=str2time($1) if $v=~/(?:accessed|retrieved)(?: +on)? +(\d{4}-\d{2}-\d{2}|\d+ \w+,? \d{4}|\w+ \d+,? \d{4})/i;

            # Bracketed geocities links first (format 1), then hide all
            # remaining bracketed links and handle bare links (format 2)
            $v=~s{\[(http://(?:[\w\d-]+\.)*geocities\.com(?:[/:][^][<>\x22\x00-\x20\x7F]+)?)( *[^\]\x00-\x08\x0a-\x1F]*?)\]}{ chkExtLink($self,$api,1,$1,$dt // time(),$2) }ge;
            return 60 if($self->{'flags'}{'fail'});
            ($v,$nw)=$api->strip_regex(qr{\[http://[^][<>\x22\x00-\x20\x7F]+ *[^\]\x00-\x08\x0a-\x1F]*?\]}, $v, $nw);
            $v=~s{\b(http://[^][<>\x22\x00-\x20\x7F]+)}{ chkExtLink($self,$api,2,$1,$dt // time()) }ge;
            return 60 if($self->{'flags'}{'fail'});
            # NOTE(review): the code that stores the modified $v back and
            # closes this loop is not present in this view.

        # Fix any bracketed external link that doesn't have "Archive" or the
        # like in the line after it.
        $outtxt=~s{\[(http://(?:[\w\d-]+\.)*geocities\.com(?:[/:][^][<>\x22\x00-\x20\x7F]+)?)( *[^\]\x00-\x08\x0a-\x1F]*?)\](?!.*$arc)}{ chkExtLink($self,$api,1,$1,time(),$2) }ge;
        return 60 if($self->{'flags'}{'fail'});

        # Hide all bracketed external links. We have to keep track of the
        # replacement token for the ones that have "Archive" etc in their
        # display text.
        ($outtxt,$nowiki)=$api->strip_regex(qr{\[http://[^][<>\x22\x00-\x20\x7F]+ *[^\]\x00-\x08\x0a-\x1F]*?\]}, $outtxt, $nowiki);
        while(my ($k,$v)=each %$nowiki){
            push @arc, $k if $v=~m!^\[http://[^][<>\x22\x00-\x20\x7F]+ *.*$arc!;
        # NOTE(review): $arc is not rebuilt from the extended @arc in the
        # visible lines; the rebuild may be among the missing lines -- confirm.

        # Fix any bare external link that doesn't have "Archive" or the like in
        # the line after it.
        $outtxt=~s{\b(http://[^][<>\x22\x00-\x20\x7F]+)(?!.*$arc)}{ chkExtLink($self,$api,2,$1,time()) }ge;
        return 60 if($self->{'flags'}{'fail'});

        # Unstrip

        # rm marked {{dead link}} templates (and $rmdl markers)

        # rm duplicate {{dead link}} templates too

        # Save only when the text changed; the edit summary is assembled
        # from whichever flags were set while processing this page.
        if($outtxt ne $intxt){
            my @summary=();
            push @summary, 'reverting spam' if $self->{'flags'}{'oo'};
            push @summary, 'adding archiveurl for archived geocities cites' if $self->{'flags'}{'cite'};
            push @summary, 'changing archived geocities links' if $self->{'flags'}{'link'};
            push @summary, 'tagging dead geocities links' if $self->{'flags'}{'404'};
                # NOTE(review): the condition guarding this warning is missing.
                $api->warn("Changes made with no summary for $page, not editing");
            # Join as "a and b" for two parts, "a, b, and c" for more
            $summary[$#summary]='and '.$summary[$#summary] if @summary>1;
            my $summary=ucfirst(join((@summary>2)?', ':' ', @summary));
            $api->log("$summary in $page");
            my $r=$api->edit($tok, $outtxt, "$summary. $screwup", 1, 1);
            if($r->{'code'} ne 'success'){
                $api->warn("Write failed on $page: ".$r->{'error'}."\n");

        # If we've been at it long enough, let another task have a go.
        return 0 if time()>=$endtime;

    $api->log("May be DONE!");
    return 600;

# Looks up archived copies of a geocities URL and formats the replacement
# for one link.
#
# Parameters (positional):
#   $self - the task object ($self->{'ua'} and $self->{'xml'} are used)
#   $api  - the API object, used here for logging and for the bot user name
#   $fmt  - output format: 0 = cite template, 1 = bracketed external link,
#           2 = bare external link
#   $url  - the URL to check
#   $date - epoch time; the archive whose time is closest to it is preferred
#   $txt  - (format 1 only) the text following the URL inside the brackets
#
# Returns:
#   $fmt==0: the list ($u, 'YYYY-MM-DD' date string, $r404)
#   $fmt==1: "[$url$txt]$r404" when $r404 is true, else "[$u$txt]$rmdl"
#   $fmt==2: the bare-link replacement followed by trailing text and $rmdl
#   other:   undef
#   A 'fail' value ('fail' x 3 in list context) is returned by the guarded
#   line near the top.
#
# NOTE(review): several guard conditions, the construction of the archive
# query URLs and the closing braces are not present in this view; comments
# below describe only what is visible.
sub chkExtLink {
    my $self=shift;
        # NOTE(review): the condition guarding this return is missing.
        # Calls such as `return chkExtLink($self)` below pass only $self,
        # so presumably this path handles that case and marks the run as
        # failed via $self->{'flags'}{'fail'} -- TODO confirm.
        return wantarray?('fail','fail','fail'):'fail';

    my $api=shift;
    my $fmt=shift;
    my $url=shift;
    my $date=shift;
    my $txt='';

        # NOTE(review): the `if` head enclosing this indented section is
        # missing; presumably it applies to bare links ($fmt==2) -- confirm.
        # Duplicate Mediawiki post-processing of bare external links
        # (anything cut off the end of $url is prepended to $txt)
        $txt=$1.$txt if $url=~s/((?:[<>]|&[lg]t;).*$)//;
        my $sep=',;\.:!?';
        # A trailing ")" counts as punctuation only if the URL has no "("
        $sep.=')' unless $url=~/\(/;
        $txt=$1.$txt if $url=~s/([$sep]+$)//;

        # There shouldn't be a template inside the url
        $txt=$1.$txt if $url=~s/(\{\{.*$)//;

        # Not a geocities URL after trimming: hand it back unchanged
        return $url.$txt unless $url=~m!^http://(?:[\w\d-]+\.)*geocities\.com[/:]!;

    # Get archive link and date
    # Each entry pushed onto @archives is a two-element array ref:
    # [epoch time, archive URL]
    my @archives;
    my ($u, $dt);
    # NOTE(review): the `if` branch preceding this `else`, and the assignment
    # of $u, are missing from this view.
    } else {
        $api->log("... Checking $u");

        # Screen-scrape
        my $r=$self->{'ua'}->get($u);
            # Collect every href in the response that begins with digits and
            # a slash
            foreach $_ ($r->decoded_content=~m!href="(\d+/[^\x22]*)"!g) {
                $_ = HTML::Entities::decode($_);
                $api->log("... Got $_");

                } else {
                push @archives, [$dt, $_];
        } elsif($r->code eq '404'){
            $api->log("... Failed with ".$r->code);
        } elsif($r->code eq '403' && $r->decoded_content=~m!<p class="mainTitle">Blocked Site Error.<br><br>\s*</p>\s*<p class="mainBigBody"><i>\Q$url\E</i> is not available in the Wayback Machine!){
            $api->log("... Failed with 403 'not available in the Wayback Machine'");
        } else {
            # Any other status is treated as temporary
            $api->log("... Failed with ".$r->code.", will retry later");
            return chkExtLink($self);

        # check webcite too
        # NOTE(review): the request that produces $r for this section is
        # missing from this view.
        $api->log("... Checking $u");
            my $xml=$self->{'xml'}->load_xml(string=>$r->decoded_content);
                # Every <result> element whose status attribute is 'success'
                foreach $_ (@{$xml->find('//result[@status=\'success\']')}){
                    my $uu=URI->new('');
                    # Not exactly RFC-compliant, but it works fine
                    $uu=~s/%3A/:/g; $uu=~s/%2F/\//g;
                    $api->log("... Got $uu");
                    push @archives, [str2time($dt) // time(), $uu];
            } else {
                $api->log("... Invalid XMl data");
                return chkExtLink($self);
        } elsif($r->code eq '404'){
            $api->log("... Failed with ".$r->code);
        } else {
            $api->log("... Failed with ".$r->code.", will retry later");
            return chkExtLink($self);


    # Then pull the closest archive to the accessdate or whatever.
    # $r404 starts out as a {{dead link}} tag dated with the current UTC
    # month and year and attributed to the bot user.
    my ($diff,$r404)=(1e100,'{{dead link|date='.strftime('%B %Y', gmtime).'|bot='.$api->user.'}}');
    foreach $_ (@archives){
        if(abs($_->[0] - $date) < $diff){
            $diff=abs($_->[0] - $date);
            # NOTE(review): the remaining statements of this branch are
            # missing; presumably they record the chosen archive in $u/$dt
            # and clear $r404 -- TODO confirm.

    # NOTE(review): the head and the bodies of this if/elsif/else chain are
    # missing from this view.
    } elsif($fmt==0){
    } else {

    if($fmt==0){ # cite template
        return ($u,strftime('%Y-%m-%d',gmtime($dt // 0)),$r404);
    } elsif($fmt==1){ # Bracketed external link
        my $txt=shift;
        return $r404?"[$url$txt]$r404":"[$u$txt]$rmdl";
    } elsif($fmt==2){ # Bare external link
        return ($r404?"[$url $url]$r404":"$u$rmdl").$txt.$rmdl;
    } else {
        return undef;
