文字列の分割 - ITコンサルの日常

プログラミングC# p248

String#Splitを使って、HTTPヘッダをフィールド名と値に分けます。

using System;
using System.IO;

/// <summary>
/// Reads header.txt and prints every line after the first as a
/// header-name / value pair obtained by splitting the line on ':'.
/// </summary>
class ReadHeader
{
        /// <summary>
        /// Program entry point. The command-line arguments are not used.
        /// </summary>
        static void Main(string[] args)
        {
                FileInfo headerFile = new FileInfo("header.txt");

                using(StreamReader reader = headerFile.OpenText())
                {
                        // Skip the first line because it is the status line.
                        reader.ReadLine();

                        for(string line = reader.ReadLine(); line != null; line = reader.ReadLine())
                        {
                                // Split cuts at every ':' in the line, so a colon inside the
                                // value also splits it; only the first two pieces are printed.
                                string[] parts = line.Split(':');
                                string fieldName = parts[0];
                                string fieldValue = parts[1];
                                Console.WriteLine("HeaderName = {0} / Value = {1}", fieldName, fieldValue);
                        }
                }
        }
}

結果はこう。

HeaderName = Date / Value =  Tue, 12 Feb 2008 14
HeaderName = Server / Value =  Apache
HeaderName = Content-Type / Value =  text/html; charset=euc-jp
HeaderName = Vary / Value =  Accept-Encoding
HeaderName = Connection / Value =  close

意図せず、時間(14:58:27)がSplitされてしまっている。
これをうまくいくよう、わざわざ正規表現を使ってやってみる。

using System;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;

/// <summary>
/// Reads header.txt and prints every "name: value" line after the first
/// as a header-name / value pair, using a regular expression so that
/// colons inside the value are kept intact.
/// </summary>
class ReadHeader
{
        /// <summary>
        /// Program entry point. The command-line arguments are not used.
        /// </summary>
        static void Main(string[] args)
        {
                FileInfo file = new FileInfo("header.txt");

                using(StreamReader sr = file.OpenText())
                {
                        // Skip the first line because it is the status line.
                        sr.ReadLine();

                        // Group 1: one or more non-colon characters from the start of the line.
                        // Group 2: everything after the first ": " up to the end of the line.
                        Regex theReg = new Regex("^([^:]+): (.*)$");

                        string text;
                        while((text = sr.ReadLine()) != null)
                        {
                                // The pattern is anchored at the start of the line, so a line
                                // yields at most one match; lines that do not match are skipped.
                                Match theMatch = theReg.Match(text);
                                if(theMatch.Success)
                                {
                                        Console.WriteLine("HeaderName = {0} / Value = {1}", theMatch.Groups[1].Value, theMatch.Groups[2].Value);
                                }
                        }
                }
        }
}

行頭からコロンじゃない文字が1つ以上続き、コロンとスペースが現れた後、任意の文字が行末まで続くという正規表現を定義してみた。
結果はこう。

HeaderName = Date / Value = Tue, 12 Feb 2008 14:58:27 GMT
HeaderName = Server / Value = Apache
HeaderName = Content-Type / Value = text/html; charset=euc-jp
HeaderName = Vary / Value = Accept-Encoding
HeaderName = Connection / Value = close

時間も正しく表示できた。

正規表現 Java版

import java.io.*;
import java.util.regex.*;

/**
 * Reads header.txt and prints every "name: value" line after the first
 * as a header-name / value pair, using a regular expression.
 */
class ReadHeaderRegex
{
        /**
         * Program entry point. The command-line arguments are not used.
         *
         * @throws Exception if header.txt cannot be opened or read
         */
        public static void main(String[] args) throws Exception
        {
                BufferedReader br = null;

                try
                {
                        br = new BufferedReader(new FileReader("header.txt"));

                        // Skip the first line because it is the status line.
                        br.readLine();

                        // Group 1: one or more non-colon characters from the start of the line.
                        // Group 2: everything after the first ": " up to the end of the line.
                        Pattern p = Pattern.compile("^([^:]+): (.*)$");

                        String text = null;
                        while((text = br.readLine()) != null)
                        {
                                Matcher m = p.matcher(text);
                                // Lines that are not of the form "name: value" are skipped.
                                if(m.matches())
                                {
                                        System.out.printf("HeaderName = %s / Value = %s\n", m.group(1), m.group(2));
                                }
                        }
                }
                finally
                {
                        // Close the reader even when reading fails part-way through.
                        if(br != null)
                        {
                                br.close();
                        }
                }
        }
}

正規表現の書き方自体はもちろん同じ。結果ももちろん同じ。

正規表現 Ruby版

# Group 1: one or more non-colon characters from the start of the line.
# Group 2: everything after the first ": " up to the end of the line.
theReg = Regexp.new("^([^:]+): (.*)$");

open("header.txt") { |file|
        # Skip the first line because it is the status line.
        file.gets

        while text = file.gets do
                matchData = theReg.match(text)
                # match returns nil for a line that is not "name: value"
                # (for example a blank line); skip it instead of raising.
                next if matchData.nil?
                puts "HeaderName = #{matchData[1]} / Value = #{matchData[2]}"
        end
}

すっきり！(^o^)
くどいようだが、正規表現の書き方自体はもちろん同じ。結果ももちろん同じ。

正規表現 VBScript版

Set fs = CreateObject("Scripting.FileSystemObject")
Set ts = fs.OpenTextFile("header.txt")

' Submatch 0: one or more non-colon characters from the start of the line.
' Submatch 1: everything after the first ": " up to the end of the line.
Set theReg = New Regexp
theReg.Pattern = "^([^:]+): (.*)$"

' Skip the first line because it is the status line.
ts.ReadLine()

While ts.AtEndOfStream = False
        lineData = ts.ReadLine()

        Set matches = theReg.Execute(lineData)

        ' Only index matches(0) when the line actually matched; lines that are
        ' not of the form "name: value" are skipped.
        If matches.Count > 0 Then
                WScript.Echo "HeaderName = " & matches(0).submatches(0) & " / Value = " & matches(0).submatches(1)
        End If
Wend

ts.Close()

正規表現 JScript版

var fs = new ActiveXObject("Scripting.FileSystemObject");
var ts = fs.OpenTextFile("header.txt");

// Capture 1: one or more non-colon characters from the start of the line.
// Capture 2: everything after the first ": " up to the end of the line.
var theReg = /^([^:]+): (.*)$/;

// Skip the first line because it is the status line.
ts.ReadLine();

while(!ts.AtEndOfStream)
{
        var lineData = ts.ReadLine();

        var matches = theReg.exec(lineData);

        // exec returns null when the line does not match. Reading the captures
        // from the returned array (instead of the static RegExp.$1 / RegExp.$2)
        // avoids echoing the captures left over from an earlier line.
        if(matches != null)
        {
                WScript.Echo("HeaderName = " + matches[1] + " / Value = " + matches[2]);
        }
}

ts.Close();

正規表現 Perl版

# Open the header file for reading; stop with the OS error if that fails
# instead of silently printing nothing.
open(FILE, "<", "header.txt") or die "Cannot open header.txt: $!";

# Skip Status Line (read the first line and discard it)
my $status_line = <FILE>;

while(<FILE>)
{
        chomp;

        # $1: one or more non-colon characters from the start of the line.
        # $2: everything after the first ": " up to the end of the line.
        # Lines that are not of the form "name: value" are skipped.
        if($_ =~ /^([^:]+): (.*)$/)
        {
                print "Header = $1 / Data = $2\n";
        }
}

close(FILE);

perl版が一番手になじんだ気がした。

それにしても、カッコでくくって後から使う機能は「後方参照」とか呼ぶみたいですけど、awkとかじゃ使えないんですね。意外。