Hatena::ブログ(Diary)

Kengo Ichiki Diary このページをアンテナに追加 RSSフィード

2008年01月02日(Wed)

[]ブログはてな移行 ブログのはてな移行  を含むブックマーク ブログのはてな移行  のブックマークコメント

  • 昨日はじめたブログ遊び、 予定通りスクリプトを書き上げた (というか、 diary2blosxom.pl をチョコチョコと修正しただけ)。 色々考えたけど、結局、一番単純なアプローチ、 つまり ultop level 毎に切り分けるだけ(深い所に anchor tag を振っているものは、 そのまま見ないふり)。 はてダラ をそのまま使えるようにするため、記事毎の切り出しから日付毎の切り出しに修正。 スクリプト名前は diary2hw.pl にした。 一応、ブログとして見れるところまでは出来た。(調子にのって2004年まで遡っておいたので、 既に1000日を越えたらしい。)
  • 現時点の問題点
  • 付記(1/7/2008): 何か今みたら、「*name*[category]header」と書けば、 名前付きでカテゴリー分けされたエントリーになっている模様。 script をそのように変更した。(これって、前から機能していたんだっけ?) ハイフンは依然ダメな模様。
  • 2/10/2008: 空のカテゴリーは消去されないのか?
  • 7/8/2008: 依然、なんとかならんのかと思いながら、半分あきらめながら。
  • 7/12/2008: ちと悪戦苦闘
  • 付記(7/13/2008): はてなのスーパーpre記法(ほんと、しつこいな)でコードを載っけるのもいいかなと思い、 diary2hw.pl のコードをここに張り付けておく:
    #!/usr/local/bin/perl
    # converter from my diary.html to hw, hatena diary writer
    # Copyright (C) 2004-2008 Kengo Ichiki <kengoichiki@yahoo.com>
    # $Id: diary2hw.pl,v 1.1 2008/01/02 20:52:28 ichiki Exp $
    use HTML::Parser;
    use Jcode;
    
    $topurl  = "http://d.hatena.ne.jp/kichiki";
    $oldurl  = "http://kichiki.web.fc2.com/diary";
    $year_start = 1997;
    
    @list0 = ; # old blosxom entry files
    
    # globals for check_category_in_link
    $check_link_tag = '';
    $check_link_li = 0;
    $check_link_text = '';
    
    # check for command-line argument
    die "Usage: diary2blosxom.pl (HTML file)\n" unless @ARGV == 1;
    
    # get the command-line argument
    my $file = shift;
    my $path, $file0;
    
    if ($file =~ /^diary\d{4}\_\d{2}.shtml$/) {
        $path = '';
    }
    elsif ($file =~ /(.*\/)(diary\d{4}\_\d{2}.shtml)$/) {
        $path = $1;
        $file0 = $2;
    }
    else {
        print "Illeagal diary file!\n";
        exit;
    }
    #print "PATH = $path\n";
    
    # Create HTML Parser object
    my $p = HTML::Parser->new
        (api_version => 3,
         start_h => [\&amp;start, "tagname,attr"],
         text_h  => [\&amp;text,  "dtext"],
         end_h   => [\&amp;end,   "tagname"],
         );
    
    die "File $file does't exist.\n" unless -e $file;
    open (HTML, "< $file") or die "Cannot open $file";
    
    $h1flag  = 0; # h1 tag
    $h2flag  = 0; # h2 tag
    $preflag = 0; # pre tag
    $nsaflag = 0; # not-shown anchor tag flag -- local copy, no-href tags
    $ullevel = 0; # ul level
    
    $year  = 0;
    $month = 0;
    $day   = 0;
    $num   = 1;
    
    $name  = "";
    $title = "";
    $body  = "";
    
    $daybody = "";
    
    while (<HTML>) {
        # convert input-line into utf8
      Jcode::convert ( \$_, "utf8");
        $p->parse($_);
    }
    close (HTML);
    
    foreach $file (@list0) {
        print "removed: $file\n";
        unlink ($file);
    }
    
    exit;
    
    
    sub start
    {
        my ($tagname, $attr) = @_;
        my $i;
        my $a, $c, $tmptag;
    
        # h1 tag
        if ($tagname eq "h1") {
    	$h1flag = 1;
        }
    
        # h2 tag
        elsif ($tagname eq "h2") {
    	$h2flag = 1;
        }
    
        # a tag
        elsif ($tagname eq "a") {
    	if ($ullevel == 0 &amp;&amp; $h2flag == 1) {
    	    $_ = $attr->{id};
    	    if (/d(\d)/) {
    		$day = $1;
    		$num   = 1; # reset counter
    	    }
    	}
    
    	if ($ullevel == 1 &amp;&amp; $attr->{id} ne "") {
    	    $_ = $attr->{id};
    	    if (/d(\d)-(.)/) {
    		my $subday  = $1;
    		if ($subday != $day) {
    		    print "WRONG!! id= ",$attr->{id},"\n";
    		}
    		$name = $2;
    	    }
    	}
    
    	if ($ullevel >= 1) {
    	    if ($attr->{href} eq "") {
    		# do nothing
    		$nsaflag = 1;
    	    }
    	    else {
    		$_ = $attr->{href};
    		$_ =~ s/^images\/AMAZON/http:\/\/images.amazon.com\/images\/P/;
    		$_ =~ s/^theorems.html/http:\/\/kichiki.web.fc2.com\/diary\/theorems.html/;
    		$_ =~ s/^summaries.html/http:\/\/kichiki.web.fc2.com\/diary\/summaries.html/;
    		$_ =~ s/^images\//http:\/\/kichiki.web.fc2.com\/diary\/images\//;
    		if (/^\#d(\d)-(.)$/) {
    		    if ($year >= $year_start) {
    			$_ = sprintf ("$topurl/%4d%02d%02d/%s",
    				      $year, $month, $1, $2);
    		    } else {
    			$_ = sprintf ("$oldurl/diary%4d_%02d.shtml\#d%d-%s",
    				      $year, $month, $1, $2);
    		    }
    		} elsif (/^diary(\d{4})\_(\d{2}).shtml$/) {
    		    if ($1 >= $year_start) {
    			$_ = sprintf ("$topurl/archive/%4d%02d", $1, $2);
    		    } else {
    			$_ = sprintf ("$oldurl/diary%4d_%02d.shtml", $1, $2);
    		    }
    		} elsif (/^diary(\d{4})\_(\d{2}).shtml\#d(\d)$/) {
    		    if ($1 >= $year_start) {
    			$_ = sprintf ("$topurl/%4d%02d%02d", $1, $2, $3);
    		    } else {
    			$_ = sprintf ("$oldurl/diary%4d_%02d.shtml\#d%d",
    				      $1, $2, $3);
    		    }
    		} elsif (/^diary(\d{4})\_(\d{2}).shtml\#d(\d)-(.)$/) {
    		    if ($1 >= $year_start) {
    			$_ = sprintf ("$topurl/%4d%02d%02d/%s",
    				      $1, $2, $3, $4);
    		    } else {
    			$_ = sprintf ("$oldurl/diary%4d_%02d.shtml\#d%d-%s",
    				      $1, $2, $3, $4);
    		    }
    		}
    
    		if ($_ eq "") {
    		    # do nothing
    		    $nsaflag = 1;
    		}
    		elsif (/^LOCAL\//) {
    		    # do nothing
    		    $nsaflag = 1;
    		}
    		else {
    		    $nsaflag = 0; # to show (for sure)
    		    if ($ullevel == 1) {
    			$title .= "<a href=\"$_\">";
    		    }
    		    else {
    			$body .= "<a href=\"$_\">";
    		    }
    		}
    	    }
    	}
        }
    
        # ul tag
        elsif ($tagname eq "ul") {
    	if ($ullevel >= 1) {
    	    #$body .= "\n";
    	    for ($i = 1; $i < $ullevel; $i ++) {
    		$body .= " ";
    	    }
    	    $body .= "<ul>\n";
    	}
    
    	$ullevel ++;
        }
    
        # li tag
        elsif ($tagname eq "li") {
    	if ($ullevel == 1) {
    	    # start of the entry
    	    $name  = "";
    	    $title = "";
    	    $body  = "";
    	}
    	elsif ($ullevel > 1) {
    	    for ($i = 2; $i < $ullevel; $i ++) {
    		$body .= " ";
    	    }
    	    $body .= "<li>\n";
    	}
        }
    
        # pre tag
        elsif ($tagname eq "pre") {
    	$preflag = 1;
    	if ($ullevel == 1) {
    	    $title .= "<pre>";
    	}
    	elsif ($ullevel > 1) {
    	    $body .= "<pre>";
    	}
        }
    
        # other tags
        else {
    	if ($ullevel >= 1) {
    	    $tmptag = "<$tagname";
    	    while (($a, $c) = each (%{$attr})) {
    		# for amazon images
    		if (($tagname eq "img")
    		    and ($a eq "src")) {
    		    if ($c =~ /^images\/AMAZON/) {
    			$c =~ s/^images\/AMAZON/http:\/\/images.amazon.com\/images\/P/;
    		    }
    		    elsif ($c =~ /^images/) {
    			$c =~ s/^images\//http:\/\/kichiki.web.fc2.com\/diary\/images\//;
    		    }
    		}
    
    		if ($a ne "/") {
    		    $tmptag .= " $a=\"$c\"";
    		}
    	    }
    	    if (($tagname eq "img")
    		or ($tagname eq "hr")
    		or ($tagname eq "br")) {
    		$tmptag .= " />";
    	    }
    	    else {
    		$tmptag .= ">";
    	    }
    
    	    if ($ullevel == 1) {
    		$title .= $tmptag;
    	    }
    	    else {
    		$body .= $tmptag;
    	    }
    	}
        }
    }
    
    sub text
    {
        my ($dtext) = shift;
    
        $dtext =~ s/&amp;/&amp;/g;
        if ($preflag != 1) {
    	$dtext =~ s// /g;
    	$dtext =~ s/^\n//g;
        }
    #    else {
    #	$dtext =~ s/</</g;
    #	$dtext =~ s/>/>/g;
    #	$dtext =~ s/"/"/g;
    #    }
    
        if ($h1flag == 1) {
    	#print "dtext = $dtext\n";
    	$_ = $dtext;
    	if (/(\d)(\d)/) {
    	    $year  = $1;
    	    $month = $2;
    	    #print "year = $year\nmonth = $month\n";
    
    	    @list0 = getentries (".", $year, $month);
    	}
        }
    
        if ($preflag == 1) {
    	$dtext =~ s/\*\*\*/***/g;
    	$dtext =~ s/\-\-\-/ーーー/g;
    	$dtext =~ s/\+//g;
        }
    
        if ($nsaflag == 0) {
    	if ($ullevel == 1) {
    	    $title .= $dtext;
    	}
    	elsif ($ullevel > 1) {
    	    $body .= $dtext;
    	}
        }
    }
    
    sub end
    {
        my ($tagname) = shift;
        my $i;
        my $blosxomfile;
        my $localtitle;
    
        # h1 tag
        if ($tagname eq "h1") {
    	$h1flag = 0;
        }
    
        # h2 tag
        elsif ($tagname eq "h2") {
    	$h2flag = 0;
        }
    
        # ul tag
        elsif ($tagname eq "ul") {
    	$ullevel --;
    	if ($ullevel >= 1) {
    	    #$body .= "\n";
    	    for ($i = 1; $i < $ullevel; $i ++) {
    		$body .= " ";
    	    }
    	    $body .= "</ul>\n";
    	} else {
    	    # end of the day
    	    my $dayfile = sprintf ("%4d-%02d-%02d",
    				   $year, $month, $day);
    	    print_day_entry ($dayfile,
    			     $daybody, $year, $month, $day);
    	    # for sure
    	    $daybody = "";
    	}
        }
    
        # a tag
        elsif ($tagname eq "a") {
    	if ($ullevel >= 1 &amp;&amp; $nsaflag == 0) {
    	    if ($ullevel == 1) {
    		$title .= "</a>";
    	    }
    	    elsif ($ullevel > 1) {
    		$body .= "</a>";
    	    }
    	}
    	else {
    	    $nsaflag = 0;
    	}
        }
    
        # li tag
        elsif ($tagname eq "li") {
    	if ($ullevel == 1) {
    	    $blosxomfile = sprintf ("%4d-%02d-%02d-%02d",
    				    $year, $month, $day, $num);
    
    	    # end of the entry
    	    #print_entry ($blosxomfile,
    	    #		 $name, $title, $body,
    	    #		 $year, $month, $day, $num);
    	    if ($name ne "") {
    		$daybody .= "*$name*$title\n";
    	    } else {
    		$daybody .= "*$title\n";
    	    }
    	    # cut extra space in front of [] if exist
    	    $daybody =~ s/\*()\[/\*\[/;
    	    $daybody .= "$body\n";
    
    	    # for sure
    	    $num ++;
    	    $name  = "";
    	    $title = "";
    	    $body  = "";
    	}
    	elsif ($ullevel > 1) {
    	    #$body .= "\n";
    	    for ($i = 1; $i < $ullevel; $i ++) {
    		$body .= " ";
    	    }
    	    $body .= "</li>\n";
    	}
        }
    
        # pre tag
        elsif ($tagname eq "pre") {
    	$preflag = 0;
    	if ($ullevel == 1) {
    	    $title .= "</pre>";
    	}
    	elsif ($ullevel > 1) {
    	    $body .= "</pre>";
    	}
        }
    
        # other tags
        else {
    	if ($ullevel >= 1) {
    	    if ($ullevel == 1) {
    		$title .= "</$tagname>";
    	    }
    	    else {
    		$body .= "</$tagname>";
    	    }
    	}
        }
    }
    
    sub print_day_entry {
        my ($f, # file
    	$b, # body
    	$y, # year (4 digits)
    	$m, # month (1 - 12)
    	$d, # day   (1 - 31)
    	) = @_;
    
        my @txtmon = ("Jan", "Feb", "Mar", "Apr", "May", "Jun",
    		  "Jul", "Aug", "Sep", "Oct", "Nov", "Dec", "13");
        my $filename = '';
    
        $t =~ s/\n//g;
        $t =~ s/^ +//g;
        $t =~ s/ +$//g;
        $t =~ s/  / /g;
        $t =~ s/\Q\E//g;
        if ($t eq "") {
    	$t = "-";
        }
    
        $b =~ s/\Q\E//g;
    
        $filename = "./$f.txt";
    
        open (FH, "> tmp.txt");
        print (FH "\n");
        print (FH "$b\n");
        close (FH);
    
        my $flag_match = 0;
        my $count = 0;
        foreach my $file (@list0) {
    	if ($file eq $filename) {
    	    $flag_match = 1;
    	    if (!compare_files ("tmp.txt", $file)) {
    		# modified
    		print ("updated: $filename\n");
    		rename ("tmp.txt", $filename);
    	    }
    	    else {
    		# same
    		unlink ("tmp.txt");
    	    }
    	    # remove the file from @list0
    	    splice (@list0, $count, 1);
    
    	    return;
    	}
    	$count ++;
        }
    
        if ($flag_match == 0) {
    	# new entry
    	print ("new:     $filename\n");
    	rename ("tmp.txt", $filename);
        }
    }
    
    
    # "getfiles" is borrowed from
    # Blosxom Plugin: recentwritebacks_tree
    # Author(s): typester <typester@unknownplace.org>
    # Version: 1.0
    # Blosxom Home/Docs/Licensing: http://www.blosxom.com/
    sub getfiles {
        my $dir = $_[0];
        my @files = ;
        my @ret = ;
    
        $dir .= "/" if ($dir =~ /[^\/]$/);
        if (opendir (DIR, $dir)) {
    	@files = readdir (DIR);
    	closedir (DIR);
        }
    
        foreach my $file (@files) {
    	next if ($file eq '.' or $file eq '..');
    	if (-d "$dir$file" and $file ne '') {
    	    my @subdir = &amp;getfiles("$dir$file");
    	    @ret = (@ret, @subdir);
    	}
    	else {
    	    push (@ret, "$dir$file");
    	}
        }
    
        return @ret;
    }
    
    sub getentries {
        my ($dir, $year, $month) = @_;
        my @ret = ;
    
        my @files = getfiles ($dir);
        my $str_ym = sprintf ("%4d-%02d", $year, $month);
    
        foreach my $file (@files) {
    	if ($file =~ /$str_ym.\.txt/) {
    	    push (@ret, "$file");
    	}
        }
    
        return @ret;
    }
    
    # return true when two files are same
    sub compare_files {
        my ($f1, $f2) = @_;
        my $l1 = '';
        my $l2 = '';
        my $ret = 1; # true
    
        my $s1 = -s $f1;
        my $s2 = -s $f2;
        if ($s1 != $s2) {
    	$ret = 0; # false
        }
        else {
    	open (F1, "< $f1");
    	open (F2, "< $f2");
    
          COMPARE: while ($l1 = <F1> and
    		      $l2 = <F2>) {
    	  if ($l1 ne $l2) {
    	      $ret = 0; # false
    	      last COMPARE;
    	  }
          }
    	close (F1);
    	close (F2);
        }
    
        return $ret;
    }
    
    
    # check link
    sub check_link_start
    {
        my ($tagname, $attr) = @_;
        # $check_link_li and $check_link_tag are global.
    
        # a tag
        if ($tagname eq "a") {
    	if ($attr->{id} eq $check_link_tag) {
    	    $check_link_li = 1;
    	}
        }
    }
    
    sub check_link_text
    {
        my ($dtext) = shift;
        # $check_link_li and $check_link_text are global.
    
        $dtext =~ s/&amp;/&amp;/g;
        $dtext =~ s// /g;
        $dtext =~ s/^\n//g;
    
        if ($check_link_li == 1) {
    	$check_link_text .= $dtext;
        }
    }
    
    sub check_link_end
    {
        my ($tagname) = shift;
        # $check_link_li is global.
    
        # li tag
        if ($tagname eq "li" and $check_link_li == 1) {
    	$check_link_li = 0;
        }
    }
    
    cf. 7/12/2008: はてな名前付きタグの件。
  • 5/1/2010: blog 復活(で、結構、手間取ったよ)。
  • 1/8/2011: fc2.com への引越し

Connection: close