SlideShare una empresa de Scribd logo
1 de 23
Descargar para leer sin conexión
All YOUR PAGE ARE BELONG TO US
    すべてのウェブページをこの手に

            2012/11/16

           株式会社はてな
          大西康裕 id:onishi
  id:onishi 大西康裕
  ONISHI
  @yasuhiro_onishi
  株式会社はてな
  はてなブログ
ウェブページを
保存したい
ウェブページを保存したい
 •ウェブページは日々変化する
 •手元に置いておきたい
 •競合調査
 • 魚拓

 •画像などまとめて保存したい
Google
Chrome
HTML::Parser
# Slide excerpt: minimal HTML::Parser skeleton. The handlers are empty
# placeholders here; later slides fill them in.
# Accumulates the rewritten HTML.
my $result;
my $parser    = HTML::Parser->new(
    # Called for start tags; the argspec string selects what the handler receives.
    start_h      => [ sub {}, 'self,tagname,attr,text' ],
    # Fallback for every event that has no dedicated handler.
    default_h    => [ sub {}, 'self,text' ],
);
$parser->parse($content);
print $result;


   •   text
   •   start
   •   end
   •   process
   •   declaration
   •   comment
   •   default
HTML::Parser
# Slide excerpt: start-tag handler that re-emits the tag into $result,
# replacing every "src" attribute value with whatever get_src() returns.
start_h => [
    sub {
        my($self, $tagname, $attr, $text) = @_;
        $result .= "<$tagname";
        # Attributes are written back in sorted key order.
        for my $key (sort keys %$attr) {
             my $value = $attr->{$key};
             if ($key =~ /^(?:src)$/i) {
                 # HTTP GET the resource, save it, and use the local path instead
                 $value = get_src($value);
             }
             $result .= qq{ $key="$value"};
         }
         $result .= ">";
     },
     'self,tagname,attr,text',
],
HTML::Parser
# Slide excerpt: default handler that appends the event's original text
# to $result unchanged.
default_h => [
    sub {
        my($self, $text) = @_;
        $result .= $text;
    },
    'self,text',
],
完
CSSから参照
# Rewrite every url(...) reference in the stylesheet text held in $content:
# resolve it against $url, download it via get_src(), and point the
# reference at the returned local path.
# The parentheses around the capture are escaped so the pattern matches the
# literal "url(" ... ")" pair and captures only what is between them.
$content =~ s{url\(([^)]+)\)}{
    my $link = $1;

    # relative link (from HTML::ResolveLink)
    my $u = URI->new($link);
    unless (defined $u->scheme) {
        $u = $u->abs($url);
    }
    $link = get_src($u); # HTTP GET the resource, save it, and use the local path
    "url($link)";
}eg;
script 殺す
# Slide excerpt ("kill <script>"): handler argument unpacking is omitted
# on the slide, so $tagname is not declared in this fragment.
# Counter shared by the handlers below; non-zero means output is suppressed.
my $context = { disallow => 0 };
# Pattern of tag names to suppress.
my $disallow_tag = qr{script};
# Start tag of a disallowed element: increment the counter and return early.
start_h => [sub {
    if ($tagname =~ /^(?:$disallow_tag)$/i) {
        $context->{disallow}++; return;
    }
}],
# Matching end tag: decrement the counter and return early.
end_h => [sub {
    if ($tagname =~ /^(?:$disallow_tag)$/i) {
        $context->{disallow}--; return;
    }
}],
# Any other event: return early while the counter is above zero.
default_h => [sub {
    if ($context->{disallow} > 0) {
        return;
    }
}],
noscript 内を生かす
# Slide excerpt ("keep what is inside <noscript>"): handler argument
# unpacking is omitted on the slide.
# Pattern of tag names for which the handlers return immediately.
my $nodisplay_tag = qr{noscript};

# Start tag matching $nodisplay_tag: return early.
start_h => [sub {
    if ($tagname =~ /^(?:$nodisplay_tag)$/i) {
        return;
    }
}],
# End tag matching $nodisplay_tag: return early as well.
end_h => [sub {
    if ($tagname =~ /^(?:$nodisplay_tag)$/i) {
        return;
    }
}],
base


# Slide excerpt: for a <base> tag, replace the href attribute value with "./".
# The surrounding attribute loop that supplies $key and $value is omitted
# on the slide.
start_h => [sub {
    if ($tagname =~ /^(?:base)$/i and $key =~ /^(?:href)$/i)   {
        $value = "./";
    }
}],
できました!

gist.github.com/

   4071196
#!/usr/bin/env perl
use strict;
use warnings;
use utf8;

use    DateTime;
use    Digest::SHA1 qw(sha1_hex);
use    Encode;
use    File::Path qw/make_path/;
use    HTML::Parser;
use    HTML::ResolveLink;
use    HTTP::Request::Common qw/GET/;
use    IO::All;
use    LWP::UserAgent;
use    URI;

# Base directory under which every snapshot is written; also used as the
# replacement value for <base href>.
my $path = './';

# The page to archive is taken from the first command-line argument.
my $uri      = URI->new(shift) or die "usage: $0 URL\n";
my $now      = DateTime->now;
my $ymd      = $now->ymd;
my $ua       = LWP::UserAgent->new(agent => 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)');
my $resolver = HTML::ResolveLink->new(base => $uri);
my $res      = $ua->request(GET $uri);

# Refuse to archive an error response or a body that could not be decoded.
$res->is_success
    or die "GET $uri failed: " . $res->status_line . "\n";
my $content = $res->decoded_content;
defined $content
    or die "could not decode the response body of $uri\n";

# Make every link in the page absolute before it is parsed.
$content = $resolver->resolve($content);

# Snapshot directory: <path>/<sanitized URL>/<YYYY-MM-DD>/
my $dir = "$uri";
$dir =~ s{[^A-Za-z0-9.]+}{-}g;   # collapse anything unsafe for a file name
$dir =~ s{-+$}{};
$dir = "$path/$dir/$ymd/";
$dir =~ s{/+}{/}g;               # squeeze duplicate slashes

make_path($dir);

# Elements that are dropped together with everything inside them.
my $disallow_tag  = qr{script};
# Elements whose own tags are dropped while their content is kept.
my $nodisplay_tag = qr{noscript};

# Rewritten HTML is accumulated here by the handlers below.
my $result = '';
# disallow > 0 means we are inside a dropped element.
my $context = { disallow => 0 };
my $parser = HTML::Parser->new(
    api_version => 3,

    # Start tags: re-emit the tag with its attributes in sorted order,
    # pointing src / <link href> / <base href> at local copies.
    start_h => [
        sub {
            my ($self, $tagname, $attr, $text) = @_;
            if ($tagname =~ /^(?:$nodisplay_tag)$/i) {
                return;
            }
            elsif ($tagname =~ /^(?:$disallow_tag)$/i) {
                $context->{disallow}++;
                return;
            }
            $result .= "<$tagname";
            for my $key (sort keys %$attr) {
                $key eq '/' and next;    # self-closing marker, not an attribute
                my $value = $attr->{$key};
                if ($key =~ /^(?:src)$/i) {
                    $value = get_src($value);
                }
                elsif ($tagname =~ /^(?:link)$/i and $key =~ /^(?:href)$/i) {
                    $value = get_link($value);
                }
                elsif ($tagname =~ /^(?:base)$/i and $key =~ /^(?:href)$/i) {
                    $value = $path;
                }
                # get_src/get_link return nothing for an empty URL.
                defined $value or $value = '';
                # HTML::Parser hands over attribute values with entities
                # already decoded, so re-escape what would break a
                # double-quoted attribute.
                $value =~ s/&/&amp;/g;
                $value =~ s/"/&quot;/g;
                $result .= qq{ $key="$value"};
            }
            $result .= ">";
        },
        'self,tagname,attr,text',
    ],

    # End tags: mirror the start-tag filtering, otherwise copy verbatim.
    end_h => [
        sub {
            my ($self, $tagname, $text) = @_;
            if ($tagname =~ /^(?:$nodisplay_tag)$/i) {
                return;
            }
            elsif ($tagname =~ /^(?:$disallow_tag)$/i) {
                $context->{disallow}--;
                return;
            }
            $result .= $text;
        },
        'self,tagname,text',
    ],

    # Everything else (text, comments, declarations, ...): copy verbatim
    # unless we are inside a dropped element.
    default_h => [
        sub {
            my ($self, $text) = @_;
            if ($context->{disallow} > 0) {
                return;
            }
            $result .= $text;
        },
        'self,text',
    ],
);

# Run the page through the parser. eof() tells HTML::Parser the document is
# complete so that any text it is still buffering reaches the handlers.
$parser->parse($content);
$parser->eof;

# The snapshot is always written as UTF-8, so declare that right after the
# opening <head> tag. \b keeps the pattern from matching <header>.
$result =~ s{(<head\b[^>]*>)}{$1<meta http-equiv="Content-Type" content="text/html; charset=utf-8">}i; # XXX

$result = Encode::encode('utf-8', $result);

# IO::All overloads ">" to write the scalar to the file.
$result > io("${dir}index.html");

# Report where the snapshot was written.
print "${dir}index.html\n";

# get_src($src)
#
# Download the resource at $src into "${dir}file/" (unless a file with the
# derived name already exists there) and return its path relative to $dir,
# i.e. "file/<name>". Returns nothing when $src is empty.
#
# A failed download is reported with a warning and leaves an empty
# placeholder file, so the returned path always exists and the server's
# error page is never stored as if it were the asset.
sub get_src {
    my $src = shift or return;

    unless (-e "${dir}file") {
        make_path("${dir}file");
    }

    # Derive a flat, file-system-safe name from the URL.
    my $file = "$src";
    $file =~ s{[^A-Za-z0-9.]+}{-}g;
    if (length($file) > 255) {
        # Too long for a file name: fall back to a fixed-length digest.
        $file = sha1_hex($file);
    }
    $file = "file/$file";
    $file =~ s{/+}{/}g;

    unless (-e "$dir$file") {
        my $res  = $ua->request(GET $src);
        my $body = '';
        if ($res->is_success) {
            $body = $res->content;
        }
        else {
            warn "GET $src failed: " . $res->status_line . "\n";
        }
        # IO::All overloads ">>" to append the scalar to the file.
        $body >> io("$dir$file");
        sleep(1); # throttle so we do not trip the remote site's DoS protection
    }

    return $file;
}

# get_link($url)
#
# Download the stylesheet at $url via get_src(), then rewrite every
# url(...) reference inside it: each referenced resource is downloaded too
# and the reference is replaced by its local file name. Returns the
# stylesheet's path relative to $dir, or nothing when $url is empty.
sub get_link {
    my $url = shift or return;

    my $file    = get_src($url);
    my $io      = io("$dir$file");
    my $content = $io->slurp;

    # Maps the raw text between "url(" and ")" to its replacement.
    my $localize = sub {
        my $link = shift;

        # Strip surrounding whitespace and quotes.
        $link =~ s{^[\s"']+}{};
        $link =~ s{[\s"']+$}{};

        # Nothing to download for empty references or inline data.
        return "url($link)" if $link eq '' or $link =~ /^data:/i;

        # relative link (from HTML::ResolveLink)
        my $u = URI->new($link);
        unless (defined $u->scheme) {
            $u = $u->abs($url);
        }

        my $local = get_src($u);
        # The stylesheet itself lives in file/, so siblings are referenced
        # by bare name.
        $local =~ s{^file/}{};
        return "url($local)";
    };

    # The parentheses around the capture are escaped so that only the text
    # between the literal "url(" and ")" is captured and replaced.
    $content =~ s{url\(([^)]+)\)}{ $localize->($1) }eg;

    # IO::All overloads ">" to overwrite the file with the scalar.
    $content > $io;
    return $file;
}
Google
Chrome
wget.pl
どうぞご利用ください!

gist.github.com/

   4071196
ご清聴ありがとうございました

Más contenido relacionado

La actualidad más candente

introduction to Django in five slides
introduction to Django in five slides introduction to Django in five slides
introduction to Django in five slides Dan Chudnov
 
An Elephant of a Different Colour: Hack
An Elephant of a Different Colour: HackAn Elephant of a Different Colour: Hack
An Elephant of a Different Colour: HackVic Metcalfe
 
The History of PHPersistence
The History of PHPersistenceThe History of PHPersistence
The History of PHPersistenceHugo Hamon
 
Database Management - Lecture 4 - PHP and Mysql
Database Management - Lecture 4 - PHP and MysqlDatabase Management - Lecture 4 - PHP and Mysql
Database Management - Lecture 4 - PHP and MysqlAl-Mamun Sarkar
 
Database Design Patterns
Database Design PatternsDatabase Design Patterns
Database Design PatternsHugo Hamon
 
Uncovering Iterators
Uncovering IteratorsUncovering Iterators
Uncovering Iteratorssdevalk
 
PHP Lecture 4 - Working with form, GET and Post Methods
PHP Lecture 4 - Working with form, GET and Post MethodsPHP Lecture 4 - Working with form, GET and Post Methods
PHP Lecture 4 - Working with form, GET and Post MethodsAl-Mamun Sarkar
 
PHP for Adults: Clean Code and Object Calisthenics
PHP for Adults: Clean Code and Object CalisthenicsPHP for Adults: Clean Code and Object Calisthenics
PHP for Adults: Clean Code and Object CalisthenicsGuilherme Blanco
 
Doctrine fixtures
Doctrine fixturesDoctrine fixtures
Doctrine fixturesBill Chang
 
Perl Fitxers i Directoris
Perl Fitxers i DirectorisPerl Fitxers i Directoris
Perl Fitxers i Directorisfrankiejol
 
Object Calisthenics Applied to PHP
Object Calisthenics Applied to PHPObject Calisthenics Applied to PHP
Object Calisthenics Applied to PHPGuilherme Blanco
 

La actualidad más candente (20)

Database api
Database apiDatabase api
Database api
 
Inc
IncInc
Inc
 
introduction to Django in five slides
introduction to Django in five slides introduction to Django in five slides
introduction to Django in five slides
 
An Elephant of a Different Colour: Hack
An Elephant of a Different Colour: HackAn Elephant of a Different Colour: Hack
An Elephant of a Different Colour: Hack
 
The History of PHPersistence
The History of PHPersistenceThe History of PHPersistence
The History of PHPersistence
 
Database Management - Lecture 4 - PHP and Mysql
Database Management - Lecture 4 - PHP and MysqlDatabase Management - Lecture 4 - PHP and Mysql
Database Management - Lecture 4 - PHP and Mysql
 
PHP pod mikroskopom
PHP pod mikroskopomPHP pod mikroskopom
PHP pod mikroskopom
 
Database Design Patterns
Database Design PatternsDatabase Design Patterns
Database Design Patterns
 
Php
PhpPhp
Php
 
Php (1)
Php (1)Php (1)
Php (1)
 
Uncovering Iterators
Uncovering IteratorsUncovering Iterators
Uncovering Iterators
 
Agile database access with CakePHP 3
Agile database access with CakePHP 3Agile database access with CakePHP 3
Agile database access with CakePHP 3
 
linieaire regressie
linieaire regressielinieaire regressie
linieaire regressie
 
PHP Lecture 4 - Working with form, GET and Post Methods
PHP Lecture 4 - Working with form, GET and Post MethodsPHP Lecture 4 - Working with form, GET and Post Methods
PHP Lecture 4 - Working with form, GET and Post Methods
 
PHP for Adults: Clean Code and Object Calisthenics
PHP for Adults: Clean Code and Object CalisthenicsPHP for Adults: Clean Code and Object Calisthenics
PHP for Adults: Clean Code and Object Calisthenics
 
Doctrine fixtures
Doctrine fixturesDoctrine fixtures
Doctrine fixtures
 
Perl Fitxers i Directoris
Perl Fitxers i DirectorisPerl Fitxers i Directoris
Perl Fitxers i Directoris
 
CGI.pm - 3ло?!
CGI.pm - 3ло?!CGI.pm - 3ло?!
CGI.pm - 3ло?!
 
Table through php
Table through phpTable through php
Table through php
 
Object Calisthenics Applied to PHP
Object Calisthenics Applied to PHPObject Calisthenics Applied to PHP
Object Calisthenics Applied to PHP
 

Destacado

Redmine::ChanでIRCからプロジェクト管理
Redmine::ChanでIRCからプロジェクト管理Redmine::ChanでIRCからプロジェクト管理
Redmine::ChanでIRCからプロジェクト管理Yasuhiro Onishi
 
Webサーバ勉強会#5
Webサーバ勉強会#5Webサーバ勉強会#5
Webサーバ勉強会#5oranie Narut
 
ウェブアプリケーションのパフォーマンスチューニング
ウェブアプリケーションのパフォーマンスチューニングウェブアプリケーションのパフォーマンスチューニング
ウェブアプリケーションのパフォーマンスチューニングYasuhiro Onishi
 
Hatena blogdevelopmentflow
Hatena blogdevelopmentflowHatena blogdevelopmentflow
Hatena blogdevelopmentflowYasuhiro Onishi
 

Destacado (6)

開発合宿!!!!
開発合宿!!!!開発合宿!!!!
開発合宿!!!!
 
Redmine::ChanでIRCからプロジェクト管理
Redmine::ChanでIRCからプロジェクト管理Redmine::ChanでIRCからプロジェクト管理
Redmine::ChanでIRCからプロジェクト管理
 
oEmbed と Text::Hatena
oEmbed と Text::HatenaoEmbed と Text::Hatena
oEmbed と Text::Hatena
 
Webサーバ勉強会#5
Webサーバ勉強会#5Webサーバ勉強会#5
Webサーバ勉強会#5
 
ウェブアプリケーションのパフォーマンスチューニング
ウェブアプリケーションのパフォーマンスチューニングウェブアプリケーションのパフォーマンスチューニング
ウェブアプリケーションのパフォーマンスチューニング
 
Hatena blogdevelopmentflow
Hatena blogdevelopmentflowHatena blogdevelopmentflow
Hatena blogdevelopmentflow
 

Similar a wget.pl

Perl Bag of Tricks - Baltimore Perl mongers
Perl Bag of Tricks  -  Baltimore Perl mongersPerl Bag of Tricks  -  Baltimore Perl mongers
Perl Bag of Tricks - Baltimore Perl mongersbrian d foy
 
Simple Ways To Be A Better Programmer (OSCON 2007)
Simple Ways To Be A Better Programmer (OSCON 2007)Simple Ways To Be A Better Programmer (OSCON 2007)
Simple Ways To Be A Better Programmer (OSCON 2007)Michael Schwern
 
Drupal Development (Part 2)
Drupal Development (Part 2)Drupal Development (Part 2)
Drupal Development (Part 2)Jeff Eaton
 
Crazy things done on PHP
Crazy things done on PHPCrazy things done on PHP
Crazy things done on PHPTaras Kalapun
 
(Ab)Using the MetaCPAN API for Fun and Profit
(Ab)Using the MetaCPAN API for Fun and Profit(Ab)Using the MetaCPAN API for Fun and Profit
(Ab)Using the MetaCPAN API for Fun and ProfitOlaf Alders
 
Developing applications for performance
Developing applications for performanceDeveloping applications for performance
Developing applications for performanceLeon Fayer
 
Adventures in Optimization
Adventures in OptimizationAdventures in Optimization
Adventures in OptimizationDavid Golden
 
The Art of Transduction
The Art of TransductionThe Art of Transduction
The Art of TransductionDavid Stockton
 
Writing Maintainable Perl
Writing Maintainable PerlWriting Maintainable Perl
Writing Maintainable Perltinypigdotcom
 
Advanced php testing in action
Advanced php testing in actionAdvanced php testing in action
Advanced php testing in actionJace Ju
 
Designing Opeation Oriented Web Applications / YAPC::Asia Tokyo 2011
Designing Opeation Oriented Web Applications / YAPC::Asia Tokyo 2011Designing Opeation Oriented Web Applications / YAPC::Asia Tokyo 2011
Designing Opeation Oriented Web Applications / YAPC::Asia Tokyo 2011Masahiro Nagano
 

Similar a wget.pl (20)

Perl Bag of Tricks - Baltimore Perl mongers
Perl Bag of Tricks  -  Baltimore Perl mongersPerl Bag of Tricks  -  Baltimore Perl mongers
Perl Bag of Tricks - Baltimore Perl mongers
 
Bag of tricks
Bag of tricksBag of tricks
Bag of tricks
 
Daily notes
Daily notesDaily notes
Daily notes
 
Ae internals
Ae internalsAe internals
Ae internals
 
Simple Ways To Be A Better Programmer (OSCON 2007)
Simple Ways To Be A Better Programmer (OSCON 2007)Simple Ways To Be A Better Programmer (OSCON 2007)
Simple Ways To Be A Better Programmer (OSCON 2007)
 
PHP POWERPOINT SLIDES
PHP POWERPOINT SLIDESPHP POWERPOINT SLIDES
PHP POWERPOINT SLIDES
 
DBI
DBIDBI
DBI
 
Perl6 in-production
Perl6 in-productionPerl6 in-production
Perl6 in-production
 
Drupal Development (Part 2)
Drupal Development (Part 2)Drupal Development (Part 2)
Drupal Development (Part 2)
 
Crazy things done on PHP
Crazy things done on PHPCrazy things done on PHP
Crazy things done on PHP
 
My shell
My shellMy shell
My shell
 
Perl5i
Perl5iPerl5i
Perl5i
 
(Ab)Using the MetaCPAN API for Fun and Profit
(Ab)Using the MetaCPAN API for Fun and Profit(Ab)Using the MetaCPAN API for Fun and Profit
(Ab)Using the MetaCPAN API for Fun and Profit
 
Developing applications for performance
Developing applications for performanceDeveloping applications for performance
Developing applications for performance
 
Adventures in Optimization
Adventures in OptimizationAdventures in Optimization
Adventures in Optimization
 
The Art of Transduction
The Art of TransductionThe Art of Transduction
The Art of Transduction
 
Presentation1
Presentation1Presentation1
Presentation1
 
Writing Maintainable Perl
Writing Maintainable PerlWriting Maintainable Perl
Writing Maintainable Perl
 
Advanced php testing in action
Advanced php testing in actionAdvanced php testing in action
Advanced php testing in action
 
Designing Opeation Oriented Web Applications / YAPC::Asia Tokyo 2011
Designing Opeation Oriented Web Applications / YAPC::Asia Tokyo 2011Designing Opeation Oriented Web Applications / YAPC::Asia Tokyo 2011
Designing Opeation Oriented Web Applications / YAPC::Asia Tokyo 2011
 

Último

Strategies for Unlocking Knowledge Management in Microsoft 365 in the Copilot...
Strategies for Unlocking Knowledge Management in Microsoft 365 in the Copilot...Strategies for Unlocking Knowledge Management in Microsoft 365 in the Copilot...
Strategies for Unlocking Knowledge Management in Microsoft 365 in the Copilot...Drew Madelung
 
GenAI Risks & Security Meetup 01052024.pdf
GenAI Risks & Security Meetup 01052024.pdfGenAI Risks & Security Meetup 01052024.pdf
GenAI Risks & Security Meetup 01052024.pdflior mazor
 
Tata AIG General Insurance Company - Insurer Innovation Award 2024
Tata AIG General Insurance Company - Insurer Innovation Award 2024Tata AIG General Insurance Company - Insurer Innovation Award 2024
Tata AIG General Insurance Company - Insurer Innovation Award 2024The Digital Insurer
 
How to Troubleshoot Apps for the Modern Connected Worker
How to Troubleshoot Apps for the Modern Connected WorkerHow to Troubleshoot Apps for the Modern Connected Worker
How to Troubleshoot Apps for the Modern Connected WorkerThousandEyes
 
Real Time Object Detection Using Open CV
Real Time Object Detection Using Open CVReal Time Object Detection Using Open CV
Real Time Object Detection Using Open CVKhem
 
04-2024-HHUG-Sales-and-Marketing-Alignment.pptx
04-2024-HHUG-Sales-and-Marketing-Alignment.pptx04-2024-HHUG-Sales-and-Marketing-Alignment.pptx
04-2024-HHUG-Sales-and-Marketing-Alignment.pptxHampshireHUG
 
ProductAnonymous-April2024-WinProductDiscovery-MelissaKlemke
ProductAnonymous-April2024-WinProductDiscovery-MelissaKlemkeProductAnonymous-April2024-WinProductDiscovery-MelissaKlemke
ProductAnonymous-April2024-WinProductDiscovery-MelissaKlemkeProduct Anonymous
 
Powerful Google developer tools for immediate impact! (2023-24 C)
Powerful Google developer tools for immediate impact! (2023-24 C)Powerful Google developer tools for immediate impact! (2023-24 C)
Powerful Google developer tools for immediate impact! (2023-24 C)wesley chun
 
TrustArc Webinar - Unlock the Power of AI-Driven Data Discovery
TrustArc Webinar - Unlock the Power of AI-Driven Data DiscoveryTrustArc Webinar - Unlock the Power of AI-Driven Data Discovery
TrustArc Webinar - Unlock the Power of AI-Driven Data DiscoveryTrustArc
 
Strategies for Landing an Oracle DBA Job as a Fresher
Strategies for Landing an Oracle DBA Job as a FresherStrategies for Landing an Oracle DBA Job as a Fresher
Strategies for Landing an Oracle DBA Job as a FresherRemote DBA Services
 
Apidays New York 2024 - The value of a flexible API Management solution for O...
Apidays New York 2024 - The value of a flexible API Management solution for O...Apidays New York 2024 - The value of a flexible API Management solution for O...
Apidays New York 2024 - The value of a flexible API Management solution for O...apidays
 
The 7 Things I Know About Cyber Security After 25 Years | April 2024
The 7 Things I Know About Cyber Security After 25 Years | April 2024The 7 Things I Know About Cyber Security After 25 Years | April 2024
The 7 Things I Know About Cyber Security After 25 Years | April 2024Rafal Los
 
Developing An App To Navigate The Roads of Brazil
Developing An App To Navigate The Roads of BrazilDeveloping An App To Navigate The Roads of Brazil
Developing An App To Navigate The Roads of BrazilV3cube
 
Connector Corner: Accelerate revenue generation using UiPath API-centric busi...
Connector Corner: Accelerate revenue generation using UiPath API-centric busi...Connector Corner: Accelerate revenue generation using UiPath API-centric busi...
Connector Corner: Accelerate revenue generation using UiPath API-centric busi...DianaGray10
 
Finology Group – Insurtech Innovation Award 2024
Finology Group – Insurtech Innovation Award 2024Finology Group – Insurtech Innovation Award 2024
Finology Group – Insurtech Innovation Award 2024The Digital Insurer
 
2024: Domino Containers - The Next Step. News from the Domino Container commu...
2024: Domino Containers - The Next Step. News from the Domino Container commu...2024: Domino Containers - The Next Step. News from the Domino Container commu...
2024: Domino Containers - The Next Step. News from the Domino Container commu...Martijn de Jong
 
Data Cloud, More than a CDP by Matt Robison
Data Cloud, More than a CDP by Matt RobisonData Cloud, More than a CDP by Matt Robison
Data Cloud, More than a CDP by Matt RobisonAnna Loughnan Colquhoun
 
Workshop - Best of Both Worlds_ Combine KG and Vector search for enhanced R...
Workshop - Best of Both Worlds_ Combine  KG and Vector search for  enhanced R...Workshop - Best of Both Worlds_ Combine  KG and Vector search for  enhanced R...
Workshop - Best of Both Worlds_ Combine KG and Vector search for enhanced R...Neo4j
 
Axa Assurance Maroc - Insurer Innovation Award 2024
Axa Assurance Maroc - Insurer Innovation Award 2024Axa Assurance Maroc - Insurer Innovation Award 2024
Axa Assurance Maroc - Insurer Innovation Award 2024The Digital Insurer
 
🐬 The future of MySQL is Postgres 🐘
🐬  The future of MySQL is Postgres   🐘🐬  The future of MySQL is Postgres   🐘
🐬 The future of MySQL is Postgres 🐘RTylerCroy
 

Último (20)

Strategies for Unlocking Knowledge Management in Microsoft 365 in the Copilot...
Strategies for Unlocking Knowledge Management in Microsoft 365 in the Copilot...Strategies for Unlocking Knowledge Management in Microsoft 365 in the Copilot...
Strategies for Unlocking Knowledge Management in Microsoft 365 in the Copilot...
 
GenAI Risks & Security Meetup 01052024.pdf
GenAI Risks & Security Meetup 01052024.pdfGenAI Risks & Security Meetup 01052024.pdf
GenAI Risks & Security Meetup 01052024.pdf
 
Tata AIG General Insurance Company - Insurer Innovation Award 2024
Tata AIG General Insurance Company - Insurer Innovation Award 2024Tata AIG General Insurance Company - Insurer Innovation Award 2024
Tata AIG General Insurance Company - Insurer Innovation Award 2024
 
How to Troubleshoot Apps for the Modern Connected Worker
How to Troubleshoot Apps for the Modern Connected WorkerHow to Troubleshoot Apps for the Modern Connected Worker
How to Troubleshoot Apps for the Modern Connected Worker
 
Real Time Object Detection Using Open CV
Real Time Object Detection Using Open CVReal Time Object Detection Using Open CV
Real Time Object Detection Using Open CV
 
04-2024-HHUG-Sales-and-Marketing-Alignment.pptx
04-2024-HHUG-Sales-and-Marketing-Alignment.pptx04-2024-HHUG-Sales-and-Marketing-Alignment.pptx
04-2024-HHUG-Sales-and-Marketing-Alignment.pptx
 
ProductAnonymous-April2024-WinProductDiscovery-MelissaKlemke
ProductAnonymous-April2024-WinProductDiscovery-MelissaKlemkeProductAnonymous-April2024-WinProductDiscovery-MelissaKlemke
ProductAnonymous-April2024-WinProductDiscovery-MelissaKlemke
 
Powerful Google developer tools for immediate impact! (2023-24 C)
Powerful Google developer tools for immediate impact! (2023-24 C)Powerful Google developer tools for immediate impact! (2023-24 C)
Powerful Google developer tools for immediate impact! (2023-24 C)
 
TrustArc Webinar - Unlock the Power of AI-Driven Data Discovery
TrustArc Webinar - Unlock the Power of AI-Driven Data DiscoveryTrustArc Webinar - Unlock the Power of AI-Driven Data Discovery
TrustArc Webinar - Unlock the Power of AI-Driven Data Discovery
 
Strategies for Landing an Oracle DBA Job as a Fresher
Strategies for Landing an Oracle DBA Job as a FresherStrategies for Landing an Oracle DBA Job as a Fresher
Strategies for Landing an Oracle DBA Job as a Fresher
 
Apidays New York 2024 - The value of a flexible API Management solution for O...
Apidays New York 2024 - The value of a flexible API Management solution for O...Apidays New York 2024 - The value of a flexible API Management solution for O...
Apidays New York 2024 - The value of a flexible API Management solution for O...
 
The 7 Things I Know About Cyber Security After 25 Years | April 2024
The 7 Things I Know About Cyber Security After 25 Years | April 2024The 7 Things I Know About Cyber Security After 25 Years | April 2024
The 7 Things I Know About Cyber Security After 25 Years | April 2024
 
Developing An App To Navigate The Roads of Brazil
Developing An App To Navigate The Roads of BrazilDeveloping An App To Navigate The Roads of Brazil
Developing An App To Navigate The Roads of Brazil
 
Connector Corner: Accelerate revenue generation using UiPath API-centric busi...
Connector Corner: Accelerate revenue generation using UiPath API-centric busi...Connector Corner: Accelerate revenue generation using UiPath API-centric busi...
Connector Corner: Accelerate revenue generation using UiPath API-centric busi...
 
Finology Group – Insurtech Innovation Award 2024
Finology Group – Insurtech Innovation Award 2024Finology Group – Insurtech Innovation Award 2024
Finology Group – Insurtech Innovation Award 2024
 
2024: Domino Containers - The Next Step. News from the Domino Container commu...
2024: Domino Containers - The Next Step. News from the Domino Container commu...2024: Domino Containers - The Next Step. News from the Domino Container commu...
2024: Domino Containers - The Next Step. News from the Domino Container commu...
 
Data Cloud, More than a CDP by Matt Robison
Data Cloud, More than a CDP by Matt RobisonData Cloud, More than a CDP by Matt Robison
Data Cloud, More than a CDP by Matt Robison
 
Workshop - Best of Both Worlds_ Combine KG and Vector search for enhanced R...
Workshop - Best of Both Worlds_ Combine  KG and Vector search for  enhanced R...Workshop - Best of Both Worlds_ Combine  KG and Vector search for  enhanced R...
Workshop - Best of Both Worlds_ Combine KG and Vector search for enhanced R...
 
Axa Assurance Maroc - Insurer Innovation Award 2024
Axa Assurance Maroc - Insurer Innovation Award 2024Axa Assurance Maroc - Insurer Innovation Award 2024
Axa Assurance Maroc - Insurer Innovation Award 2024
 
🐬 The future of MySQL is Postgres 🐘
🐬  The future of MySQL is Postgres   🐘🐬  The future of MySQL is Postgres   🐘
🐬 The future of MySQL is Postgres 🐘
 

wget.pl

  • 1. All YOUR PAGE ARE BELONG TO US すべてのウェブページをこの手に 2012/11/16 株式会社はてな 大西康裕 id:onishi
  • 2.   id:onishi 大西康裕   ONISHI   @yasuhiro_onishi   株式会社はてな   はてなブログ
  • 6.
  • 7. HTML::Parser my $result; my $parser = HTML::Parser->new( start_h => [ sub {}, 'self,tagname,attr,text' ], default_h => [ sub {}, 'self,text' ], ); $parser->parse($content); print $result; • text • start • end • process • declaration • comment • default
  • 8. HTML::Parser start_h => [ sub { my($self, $tagname, $attr, $text) = @_; $result .= "<$tagname"; for my $key (sort keys %$attr) { my $value = $attr->{$key}; if ($key =~ /^(?:src)$/i) { # HTTP GET して保存してローカルパスにする $value = get_src($value); } $result .= qq{ $key="$value"}; } $result .= ">"; }, 'self,tagname,attr,text', ],
  • 9. HTML::Parser default_h => [ sub { my($self, $text) = @_; $result .= $text; }, 'self,text', ],
  • 10.
  • 11.
  • 12. CSSから参照 $content =~ s{url(([^)]+))}{ my $link = $1; # relative link (from HTML::ResolveLink) my $u = URI->new($link); unless (defined $u->scheme) { my $old = $u; $u = $u->abs($url); } $link = get_src($u); # HTTP GET して保存してローカルパスに "url($link)"; }eg;
  • 13. script 殺す my $context = { disallow => 0 }; my $disallow_tag = qr{script}; start_h => [sub { if ($tagname =~ /^(?:$disallow_tag)$/i) { $context->{disallow}++; return; } }], end_h => [sub { if ($tagname =~ /^(?:$disallow_tag)$/i) { $context->{disallow}--; return; } }], default_h => [sub { if ($context->{disallow} > 0) { return; } }],
  • 14. noscript 内を生かす my $nodisplay_tag = qr{noscript}; start_h => [sub { if ($tagname =~ /^(?:$nodisplay_tag)$/i) { return; } }], end_h => [sub { if ($tagname =~ /^(?:$nodisplay_tag)$/i) { return; } }],
  • 15. base start_h => [sub { if ($tagname =~ /^(?:base)$/i and $key =~ /^(?:href)$/i) { $value = "./"; } }],
  • 17. #!/usr/bin/env perl use strict; use warnings; use utf8; use DateTime; use Digest::SHA1 qw(sha1_hex); use Encode; use File::Path qw/make_path/; use HTML::Parser; use HTML::ResolveLink; use HTTP::Request::Common qw/GET/; use IO::All; use LWP::UserAgent; use URI; my $path = './'; my $uri = URI->new(shift) or die; my $now = DateTime->now; my $ymd = $now->ymd; my $ua = LWP::UserAgent->new(agent => 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)'); my $resolver = HTML::ResolveLink->new(base => $uri); my $res = $ua->request(GET $uri); my $content = $resolver->resolve($res->decoded_content); my $dir = $uri; $dir =~ s{[^A-Za-z0-9.]+}{-}g; $dir =~ s{-+$}{}; $dir = "$path/$dir/$ymd/"; $dir =~ s{/+}{/}g; make_path($dir); my $disallow_tag = qr{script}; my $nodisplay_tag = qr{noscript}; my $result; my $context = { disallow => 0 }; my $parser = HTML::Parser->new( api_version => 3, start_h => [ sub { my($self, $tagname, $attr, $text) = @_; if ($tagname =~ /^(?:$nodisplay_tag)$/i) { return; } elsif ($tagname =~ /^(?:$disallow_tag)$/i) { $context->{disallow}++; return; } $result .= "<$tagname"; for my $key (sort keys %$attr) { $key eq '/' and next; my $value = $attr->{$key}; if ($key =~ /^(?:src)$/i) { $value = get_src($value); } elsif ($tagname =~ /^(?:link)$/i and $key =~ /^(?:href)$/i) { $value = get_link($value); } elsif ($tagname =~ /^(?:base)$/i and $key =~ /^(?:href)$/i) { $value = $path; } $result .= qq{ $key="$value"}; } $result .= ">"; }, 'self,tagname,attr,text', ], end_h => [ sub { my($self, $tagname, $text) = @_; if ($tagname =~ /^(?:$nodisplay_tag)$/i) { return; } elsif ($tagname =~ /^(?:$disallow_tag)$/i) { $context->{disallow}--; return; } $result .= $text; }, 'self,tagname,text', ], default_h => [ sub { my($self, $text) = @_; if ($context->{disallow} > 0) { return; } $result .= $text; }, 'self,text', ], ); $parser->parse($content); $result =~ s{(<head[^>]*>)}{$1<meta http-equiv="Content-Type" content="text/html; charset=utf-8">}i; # XXX 
$result = Encode::encode('utf-8', $result); $result > io("${dir}index.html"); print "${dir}index.htmln"; sub get_src { my $src = shift or return; unless (-e "${dir}file") { make_path("${dir}file"); } my $file = $src; $file =~ s{[^A-Za-z0-9.]+}{-}g; if (length($file) > 255) { $file = sha1_hex($file); } $file = "file/$file"; $file =~ s{/+}{/}g; unless (-e "$dir$file") { $ua->request(GET $src)->content >> io("$dir$file"); sleep(1); # DOS対策対策 } $file; } sub get_link { my $url = shift or return; my $file = get_src($url); my $io = io("$dir$file"); my $content = $io->slurp; $content =~ s{url(([^)]+))}{ my $link = $1; $link =~ s{^[s"']+}{}; $link =~ s{[s"']+$}{}; # relative link (from HTML::ResolveLink) my $u = URI->new($link); unless (defined $u->scheme) { my $old = $u; $u = $u->abs($url); } $link = get_src($u); $link =~ s{^file/}{}; "url($link)"; }eg; $content > $io; return $file; }
  • 19.
  • 21.