2012-03-09

Haskellでwhichコマンド

基本機能だけです。

import qualified Control.Monad as CM
import qualified Data.Maybe as DM
import System.Directory (doesFileExist)
import System.Environment (getArgs)
import System.FilePath (getSearchPath, (</>))

findExe :: String -> IO (Maybe String)
findExe n = DM.listToMaybe `CM.fmap` (CM.filterM pred =<< getSearchPath)
    where
    pred p = doesFileExist $ p </> n

main :: IO ()
main = do
    ps <- getArgs
    CM.unless (null ps) $ findExe (head ps) >>= DM.maybe (return ()) putStrLn

2012-01-27

Haskellでwc -lを作った

プログラミング Haskell

String,ByteString,Textの速度比較を行うために、wc -lを作ってみました。
用意したコードは下記の通りです。

mywc_text.hs

{-# LANGUAGE OverloadedStrings #-}
import Control.Applicative ((<$>))
import qualified Data.Text.Lazy as LazyT
import qualified Data.Text.Lazy.IO as LazyTI
import System.Environment (getArgs)
import qualified System.IO as SI

main :: IO ()
main = do
    fp <- getArgs
    case fp of
        []  -> p LazyTI.getContents
        [f] -> SI.withFile f SI.ReadMode $ p . LazyTI.hGetContents
    where
    p i = print =<< LazyT.count "\n" <$> i

mywc_bytestring.hs

import Control.Applicative ((<$>))
import qualified Data.ByteString.Lazy.Char8 as LazyBS
import System.Environment (getArgs)
import qualified System.IO as SI

main :: IO ()
main = do
    fp <- getArgs
    case fp of
        []  -> p LazyBS.getContents
        [f] -> SI.withFile f SI.ReadMode $ p . LazyBS.hGetContents
    where
    p i = print =<< LazyBS.count '\n' <$> i

mywc_string.hs

import Control.Applicative ((<$>))
import System.Environment (getArgs)
import qualified System.IO as SI

main :: IO ()
main = do
    fp <- getArgs
    case fp of
        []  -> p SI.getContents
        [f] -> SI.withFile f SI.ReadMode $ p . SI.hGetContents
    where
    p i = print =<< length . lines <$> i

このmywc_text,mywc_bytestring,mywc_stringと、wcコマンドで行数カウントの速度を競ってもらいます。
※追記: ghc -O2でコンパイルしました。

対象データ: 10GiB / 218201000行のテキストファイル。前回同様.emacsを繋げて作成。
結果：

./mywc_text       ~/test.txt  98.07s user  2.93s system 86% cpu 1:57.16 total
./mywc_bytestring ~/test.txt  9.61s user   2.66s system 13% cpu 1:30.26 total
./mywc_string     ~/test.txt  324.95s user 4.52s system 97% cpu 5:37.19 total
wc -l             ~/test.txt  2.96s user   2.72s system  6% cpu 1:27.51 total

おまけ

wc.pl

#!/usr/bin/perl

use strict;
use warnings;
use utf8;

my $lines = 0;
my $buffer;
open(FILE, $ARGV[0]) or die "dieeeeee.$!";
while (sysread FILE, $buffer, 4096) {
    $lines += ($buffer =~ tr/\n//);
}
print $lines, "\n";

同じ試験用データ(10GiB / 218201000行のテキストファイル)をこのPerl スクリプトに食わせます。

./wc.pl ~/test.txt  9.17s user 3.31s system 12% cpu 1:36.35 total

2012-01-23

Haskellでtailコマンド - その2

プログラミング Haskell

前回作ったtailがあまりにも遅過ぎて酷かったので、全面的に書き直しました。
mmapは使ってません。conduitは使ってます。

{-
 - 方針
 -  Sourceでは、seekして後ろから4096バイトずつ(又はファイルサイズの1/10バイトずつ)取得する。
 -  Conduitで、後ろからn個目の改行がある所までで入力を切る。
 -  Sinkは後ろから順に積んでいく。
 -}

import Control.Applicative ((<$>))
import Control.Monad (when)
import Control.Monad.IO.Class (liftIO)
import qualified Data.ByteString as BS
import qualified Data.ByteString.Char8 as BC
import Data.Conduit ( ($$), ($=) )
import qualified Data.Conduit as C
import System.Environment (getArgs)
import qualified System.IO as SI

type FileSize = Integer


readLen :: FileSize -> Integer
readLen s
    | s < std  = std
    | otherwise = if x > limit then limit else x
    where
    x = s `div` 10
    std = 4096
    limit = 10 * 1024 * 1024


reverseFile :: C.ResourceIO m => FilePath -> C.Source m BS.ByteString
reverseFile fp = C.sourceIO initialize close pull
    where
    initialize = do
        h <- SI.openBinaryFile fp SI.ReadMode
        s <- readLen <$> SI.hFileSize h
        b <- SI.hIsSeekable h
        when b $ SI.hSeek h SI.SeekFromEnd (-1)
        return $ if b then Just (h,s) else Nothing
    close st
        | Nothing    <- st = return ()
        | Just (h,_) <- st = SI.hClose h
    pull st
        | Nothing     <- st = return C.Closed
        | Just (h,l)  <- st = do
            pos <- liftIO $ SI.hTell h
            if pos == 0
                then return C.Closed
                else do
                    let len = if pos < l then pos else l
                    liftIO $ SI.hSeek h SI.RelativeSeek (-len)
                    x <- liftIO $ BS.hGetSome h $ fromIntegral len
                    liftIO $ SI.hSeek h SI.RelativeSeek (-len)
                    return $ C.Open x


stackSink :: C.Resource m => C.Sink BS.ByteString m BS.ByteString
stackSink = C.sinkState BS.empty push close
    where
    close = return
    push st i = return (i `BS.append` st, C.Processing)
        

limitLinesReverse :: C.Resource m => Int -> C.Conduit BS.ByteString m BS.ByteString
limitLinesReverse count = C.conduitState count push close
    where
    close _ = return []
    push n i
        | n == 0    = return (n, C.Finished (Just i) [])
        | otherwise =  do
            let (n',bs) = loop n i
            if n' == 0
                then return (n', C.Finished Nothing [bs])
                else return (n', C.Producing [bs])
    loop n x
        | c <= n = (n - c, x)
        | c > n  = (0, BC.drop (pos - 1) x)
        where
        c = BC.count '\n' x
        pos = BC.elemIndices '\n' x !! (c - n)


main :: IO ()
main = do
    [fp, n] <- getArgs
    x <- C.runResourceT $ reverseFile fp $= limitLinesReverse (read n) $$ stackSink
    BC.putStrLn $ BC.drop 1 x

これをghc -O2 -o mytail でコンパイルしてOpenSuseのtailと比較してみました。
結果：

対象データ
    18MiB / 381091行 のテキストファイル(.emacsをずらずら繋げて作成)

500行読み込み
    自作tail    : ./mytail ~/test.txt 500  0.00s user 0.00s system 60% cpu 0.012 total
    Suseのtail  : tail -n 500 ~/test.txt  0.00s user 0.00s system 15% cpu 0.007 total

1000行読み込み
    自作tail    : ./mytail ~/test.txt 1000  0.00s user 0.01s system 41% cpu 0.029 total
    Suseのtail  : tail -n 1000 ~/test.txt  0.00s user 0.00s system 19% cpu 0.016 total

5000行読み込み
    自作tail    : ./mytail ~/test.txt 5000  0.00s user 0.01s system 18% cpu 0.082 total
    Suseのtail  : tail -n 5000 ~/test.txt  0.00s user 0.01s system 8% cpu 0.068 total

100000行読み込み
    自作tail    : ./mytail ~/test.txt 100000  0.01s user 0.21s system 14% cpu 1.438 total
    Suseのtail  : tail -n 100000 ~/test.txt  0.01s user 0.18s system 12% cpu 1.433 total

381091行(全部)読み込み
    自作tail    : ./mytail ~/test.txt 381091  0.04s user 0.69s system 13% cpu 5.546 total
    Suseのtail  : tail -n 381091 ~/test.txt  0.01s user 0.82s system 15% cpu 5.446 total

今日は気持ちよく寝れそうです。

2012-01-23

Haskellでtailコマンド

プログラミング Haskell

conduitを試しに使ってみようという事でtailコマンド書いてみました。
やれることはファイルの後ろから数行を表示するという基本だけ。
引数の処理はテキトーなので行数指定必須です。クソです。

{-# LANGUAGE OverloadedStrings #-}

import Control.Applicative ((<$>))
import Control.Monad.IO.Class (liftIO)
import qualified Data.ByteString.Char8 as BS
import Data.Conduit (($=), ($$))
import qualified Data.Conduit as C
import qualified Data.Conduit.List as CL
import Data.Int
import Data.Word (Word8)
import qualified Data.IORef as I
import System.Environment (getArgs)
import qualified System.IO as SI
import qualified System.IO.MMap as SIM

isEOL :: Char -> Bool
isEOL x = x == '\r' || x == '\n'

mmapSize :: Int64 -> Int
mmapSize fileSize
    | fs > maxSize = maxSize
    | otherwise    = fs
    where
    maxSize = 10 * 1024 * 1024
    fs = fromIntegral fileSize

reverseFile :: C.ResourceIO m => FilePath -> C.Source m BS.ByteString
reverseFile fp = C.sourceIO initialize close f
    where
    initialize = do
        offset <- fromIntegral <$> SI.withBinaryFile fp SI.ReadMode SI.hFileSize
        I.newIORef (mmapSize offset, offset - 1) -- offsetはファイル末端からmmapした領域の末端までの距離
    close = const $ return ()
    f s = do
        (mapSize, offset) <- liftIO (I.readIORef s)
        if offset <= 0
            then return C.Closed
            else do
                let msize = if fromIntegral mapSize > offset then offset else fromIntegral mapSize
                mmap <- liftIO $ SIM.mmapFileByteString fp (Just (offset - msize, fromIntegral msize))
                let (firstl, lastl) = BS.spanEnd (not . isEOL) mmap
                let buf = snd (BS.spanEnd isEOL firstl) `BS.append` lastl
                liftIO $ I.writeIORef s (mapSize, offset - fromIntegral (BS.length buf))
                return $ C.Open $ BS.copy buf


stackSink :: C.Resource m => C.Sink BS.ByteString m BS.ByteString
stackSink = C.sinkState BS.empty push return
    where
    push buf input = return (input `BS.append` buf, C.Processing)


main :: IO ()
main = do
    [fp,n] <- getArgs
    l <- BS.dropWhile isEOL <$> C.runResourceT (reverseFile fp $= CL.isolate (read n) $$ stackSink)
    BS.putStrLn l

conduitは書きやすくて再利用性が高いのがいいですね。

※suseのtailコマンドと速度を比較してみると30万行のデータを表示するのに111倍遅かったです。

2011-12-31

git rebase -iで過去のコミットを編集する

Git

git rebase -iで過去を改竄する。

git rebase -i ブランチ名で過去のコミットを編集することが出来ます。
細かいことはこちら古いコミットを書き換える：歴史修正主義者のための git rebase -i 入門 - 学習する機械、学習しない人間 Next Generation。

git rebase -iして出てくる指示書に何をしたいのか書き、過去の変更が始まったらそれぞれのフェーズにあった変更を行います。
変更が終わったら必要ならgit addし、git commit --amendして変更をコミットします。
git commit --amendすると変更内容のコミットと同時にcommitメッセージの変更を行えるので、適宜編集します。
コミットしたらgit rebase --continueで次のコミットの編集に移動します。

途中で失敗した場合はgit rebase --abortで全てを無かった事にしてgit rebase -iを行う直前に戻れます。

リモートリポジトリの過去を改竄する。

複数ヶ所のPCから同じリモートリポジトリに対して作業を行なっている場合、リモートリポジトリを通してやりかけの作業を共有できると便利です。
ただしこの場合、リモートリポジトリのコミットログを変更できないとコミットログがグッチャグチャになってしまいます。
これもgit rebase -iで解決できます。

git checkout -b ローカルのブランチ名 origin/リモートのブランチ名でブランチを切ります。
git rebase -iします。
git push origin +ローカルのブランチ名:リモートのブランチ名して強制的にコミットツリーを変更します。
他のリポジトリから変更を取得する際は、git fetchしてgit rebase origin/リモートのブランチ名としましょう。

なお、rebase -iで行った編集が、「HEADから順にいくつかのコミットを削除した」だけだと、他のcloneしたリポジトリで変更後のリモートリポジトリの内容をfetchしてrebaseしても、直接pullしても反映されないので注意。

2011-11-07

Vimからhlintを少し便利に使えるvim compiler plugin

プログラミング Haskell vim

http://blog.michaelrueegg.name/?p=209
こちらで紹介されているコンパイラプラグインを使うと、ページ下部の画像に表示されているような感じでhlintの出力する情報がvim上に表示されます。

2011-11-07

wxHaskellでXRCファイルを読み込む。

プログラミング Haskell

import Graphics.UI.WX
import Graphics.UI.WXCore

main :: IO ()
main = start $ do
    r <- xmlResourceCreateFromFile "hoge.xrc" wxXRC_USE_LOCALE
    f <- xmlResourceLoadFrame r objectNull "hogeFrame"
    b <- xmlResourceGetButton f "hogeButton"
    set b [ on command := close f ]
    windowShow f