Rust 入門 簡単な関数を書いてみる(5)

Rust 入門続き。

住所を正規化する関数、ひとまず完成。 住所の文字列から、都道府県、市区町村、番地、建物を分割し表記を正規化した構造体を返す。

文字列を文字単位に分割するには、chars() を呼び出せばよいというのはすぐに分かったが、char からユニコードのコードポイントに変換するのにちょっと迷った。
答えは知ってみれば単純で、単に u32 にキャストしてやれば良いだけだった（例: 'A' as u32）。逆にコードポイントから char に変換するには、

char::from_u32(0xff10).unwrap()

とすれば良い。

関数は impl ブロックを利用して、構造体の静的メソッド（関連関数）風に呼び出せるようにしてみた。
Rust にはクラスはないが、impl ブロックやトレイトを定義することで構造体にメソッドを生やすことができる。ここらへん golang に似てる。

extern crate regex;
use regex::Regex;
use std::collections::HashMap;

/// The 47 Japanese prefectures as `(two-digit code, name)` pairs.
///
/// Only the names are used for matching; the codes are carried along as data
/// (the numbering presumably follows the JIS X 0401 prefecture codes).
const PREFECTURES: [(&str, &str); 47] = [
    ("01", "北海道"), ("02", "青森県"), ("03", "岩手県"), ("04", "宮城県"), ("05", "秋田県"),
    ("06", "山形県"), ("07", "福島県"), ("08", "茨城県"), ("09", "栃木県"), ("10", "群馬県"),
    ("11", "埼玉県"), ("12", "千葉県"), ("13", "東京都"), ("14", "神奈川県"), ("15", "新潟県"),
    ("16", "富山県"), ("17", "石川県"), ("18", "福井県"), ("19", "山梨県"), ("20", "長野県"),
    ("21", "岐阜県"), ("22", "静岡県"), ("23", "愛知県"), ("24", "三重県"), ("25", "滋賀県"),
    ("26", "京都府"), ("27", "大阪府"), ("28", "兵庫県"), ("29", "奈良県"), ("30", "和歌山県"),
    ("31", "鳥取県"), ("32", "島根県"), ("33", "岡山県"), ("34", "広島県"), ("35", "山口県"),
    ("36", "徳島県"), ("37", "香川県"), ("38", "愛媛県"), ("39", "高知県"), ("40", "福岡県"),
    ("41", "佐賀県"), ("42", "長崎県"), ("43", "熊本県"), ("44", "大分県"), ("45", "宮崎県"),
    ("46", "鹿児島県"), ("47", "沖縄県"),
];

/// Replacement table of `(half-width, full-width)` katakana and punctuation.
///
/// The half-width keys are written as `\u{..}` escapes (Halfwidth Katakana,
/// U+FF61..U+FF9F) so they cannot be silently folded into their full-width
/// equivalents by editors or text pipelines that apply Unicode compatibility
/// normalization; when that happens every pair maps a string to itself and the
/// table converts nothing.
///
/// Order matters: the table is applied front to back, so the two-character
/// voiced / semi-voiced sequences (base kana + U+FF9E / U+FF9F) must come before
/// the bare base kana and before the standalone sound marks.
const HANKAKU_ZENKAKU_KANA_CHARS: [(&str, &str); 89] = [
    // Voiced (base + U+FF9E)
    ("\u{ff73}\u{ff9e}", "ヴ"),
    ("\u{ff76}\u{ff9e}", "ガ"), ("\u{ff77}\u{ff9e}", "ギ"), ("\u{ff78}\u{ff9e}", "グ"),
    ("\u{ff79}\u{ff9e}", "ゲ"), ("\u{ff7a}\u{ff9e}", "ゴ"),
    ("\u{ff7b}\u{ff9e}", "ザ"), ("\u{ff7c}\u{ff9e}", "ジ"), ("\u{ff7d}\u{ff9e}", "ズ"),
    ("\u{ff7e}\u{ff9e}", "ゼ"), ("\u{ff7f}\u{ff9e}", "ゾ"),
    ("\u{ff80}\u{ff9e}", "ダ"), ("\u{ff81}\u{ff9e}", "ヂ"), ("\u{ff82}\u{ff9e}", "ヅ"),
    ("\u{ff83}\u{ff9e}", "デ"), ("\u{ff84}\u{ff9e}", "ド"),
    ("\u{ff8a}\u{ff9e}", "バ"), ("\u{ff8b}\u{ff9e}", "ビ"), ("\u{ff8c}\u{ff9e}", "ブ"),
    ("\u{ff8d}\u{ff9e}", "ベ"), ("\u{ff8e}\u{ff9e}", "ボ"),
    // Semi-voiced (base + U+FF9F)
    ("\u{ff8a}\u{ff9f}", "パ"), ("\u{ff8b}\u{ff9f}", "ピ"), ("\u{ff8c}\u{ff9f}", "プ"),
    ("\u{ff8d}\u{ff9f}", "ペ"), ("\u{ff8e}\u{ff9f}", "ポ"),
    // Plain kana
    ("\u{ff71}", "ア"), ("\u{ff72}", "イ"), ("\u{ff73}", "ウ"), ("\u{ff74}", "エ"), ("\u{ff75}", "オ"),
    ("\u{ff76}", "カ"), ("\u{ff77}", "キ"), ("\u{ff78}", "ク"), ("\u{ff79}", "ケ"), ("\u{ff7a}", "コ"),
    ("\u{ff7b}", "サ"), ("\u{ff7c}", "シ"), ("\u{ff7d}", "ス"), ("\u{ff7e}", "セ"), ("\u{ff7f}", "ソ"),
    ("\u{ff80}", "タ"), ("\u{ff81}", "チ"), ("\u{ff82}", "ツ"), ("\u{ff83}", "テ"), ("\u{ff84}", "ト"),
    ("\u{ff85}", "ナ"), ("\u{ff86}", "ニ"), ("\u{ff87}", "ヌ"), ("\u{ff88}", "ネ"), ("\u{ff89}", "ノ"),
    ("\u{ff8a}", "ハ"), ("\u{ff8b}", "ヒ"), ("\u{ff8c}", "フ"), ("\u{ff8d}", "ヘ"), ("\u{ff8e}", "ホ"),
    ("\u{ff8f}", "マ"), ("\u{ff90}", "ミ"), ("\u{ff91}", "ム"), ("\u{ff92}", "メ"), ("\u{ff93}", "モ"),
    ("\u{ff94}", "ヤ"), ("\u{ff95}", "ユ"), ("\u{ff96}", "ヨ"),
    ("\u{ff97}", "ラ"), ("\u{ff98}", "リ"), ("\u{ff99}", "ル"), ("\u{ff9a}", "レ"), ("\u{ff9b}", "ロ"),
    ("\u{ff9c}", "ワ"), ("\u{ff66}", "ヲ"), ("\u{ff9d}", "ン"),
    // Small kana
    ("\u{ff67}", "ァ"), ("\u{ff68}", "ィ"), ("\u{ff69}", "ゥ"), ("\u{ff6a}", "ェ"), ("\u{ff6b}", "ォ"),
    ("\u{ff6c}", "ャ"), ("\u{ff6d}", "ュ"), ("\u{ff6e}", "ョ"), ("\u{ff6f}", "ッ"),
    // Punctuation and the prolonged sound mark
    ("\u{ff64}", "、"), ("\u{ff61}", "。"), ("\u{ff70}", "ー"), ("\u{ff62}", "「"), ("\u{ff63}", "」"),
    // Standalone sound marks (not preceded by a kana they can combine with) and the middle dot
    ("\u{ff9e}", "”"), ("\u{ff9f}", "'"), ("\u{ff65}", "・"),
];

/// Converts a number written with kanji numerals into an ASCII decimal string.
///
/// Accepted digits are ASCII `0-9`, full-width `0-9` (U+FF10..U+FF19) and the
/// kanji digits 〇一二三四五六七八九 plus the formal variants 壱弐参. Digits may be
/// combined with the multipliers 十/拾, 百, 千 and the myriad units 万, 億, 兆,
/// e.g. `"二十三"` -> `"23"`, `"1万5千"` -> `"15000"`. A multiplier without a
/// leading digit counts once (`"十"` -> `"10"`, `"十万"` -> `"100000"`).
///
/// Characters that are neither digits nor units are ignored, except that they
/// end the current run of digits; separate runs inside one group are summed.
/// An input without any digit or unit yields `"0"`.
///
/// # Panics
///
/// Panics if the value (or an intermediate result) does not fit in an `i64`.
fn kanji_numeral_to_arabic_numerals(s: &str) -> String {
    /// One lexical piece of the input: a multiplier or a plain number.
    enum Token {
        Unit(i64),
        Value(i64),
    }

    const OVERFLOW: &str = "kanji numeral does not fit in i64";

    /// Value of a single digit character in any of the supported scripts.
    fn digit_value(ch: char) -> Option<i64> {
        match ch {
            '0'..='9' => ch.to_digit(10).map(i64::from),
            '\u{ff10}'..='\u{ff19}' => Some(i64::from(ch as u32 - 0xff10)),
            '〇' => Some(0),
            '一' | '壱' => Some(1),
            '二' | '弐' => Some(2),
            '三' | '参' => Some(3),
            '四' => Some(4),
            '五' => Some(5),
            '六' => Some(6),
            '七' => Some(7),
            '八' => Some(8),
            '九' => Some(9),
            _ => None,
        }
    }

    /// Multipliers that combine inside one four-digit group.
    fn small_unit(ch: char) -> Option<i64> {
        match ch {
            '十' | '拾' => Some(10),
            '百' => Some(100),
            '千' => Some(1_000),
            _ => None,
        }
    }

    /// Myriad units that separate four-digit groups.
    fn large_unit(ch: char) -> Option<i64> {
        match ch {
            '万' => Some(10_000),
            '億' => Some(100_000_000),
            '兆' => Some(1_000_000_000_000),
            _ => None,
        }
    }

    /// Sums the tokens, scanning from the least significant end.
    ///
    /// A unit applies to the value on its left; a unit that finds no value
    /// there (start of input, or another unit) counts as one times itself.
    fn evaluate(tokens: &[Token]) -> i64 {
        let mut unit: i64 = 1;
        let mut total: i64 = 0;
        for token in tokens.iter().rev() {
            match *token {
                Token::Unit(value) => {
                    if unit > 1 {
                        // The previous unit had no digit in front of it.
                        total = total.checked_add(unit).expect(OVERFLOW);
                    }
                    unit = value;
                }
                Token::Value(value) => {
                    let scaled = value.checked_mul(unit).expect(OVERFLOW);
                    total = total.checked_add(scaled).expect(OVERFLOW);
                    unit = 1;
                }
            }
        }
        if unit > 1 {
            total = total.checked_add(unit).expect(OVERFLOW);
        }
        total
    }

    /// Value of one group, i.e. a stretch of input without 万/億/兆.
    fn group_value(group: &str) -> i64 {
        let mut tokens = Vec::new();
        let mut run: Option<i64> = None;
        for ch in group.chars() {
            if let Some(digit) = digit_value(ch) {
                let extended = run
                    .unwrap_or(0)
                    .checked_mul(10)
                    .and_then(|value| value.checked_add(digit))
                    .expect(OVERFLOW);
                run = Some(extended);
                continue;
            }
            // Anything that is not a digit terminates the current digit run.
            if let Some(value) = run.take() {
                tokens.push(Token::Value(value));
            }
            if let Some(unit) = small_unit(ch) {
                tokens.push(Token::Unit(unit));
            }
        }
        if let Some(value) = run {
            tokens.push(Token::Value(value));
        }
        evaluate(&tokens)
    }

    // Split the input at the myriad units; everything between two of them is
    // always evaluated as a group, so "十万" is 10 x 10000 rather than 10 + 10000.
    let mut tokens = Vec::new();
    let mut group_start = 0;
    for (index, ch) in s.char_indices() {
        if let Some(unit) = large_unit(ch) {
            if group_start < index {
                tokens.push(Token::Value(group_value(&s[group_start..index])));
            }
            tokens.push(Token::Unit(unit));
            group_start = index + ch.len_utf8();
        }
    }
    if group_start < s.len() {
        tokens.push(Token::Value(group_value(&s[group_start..])));
    }

    evaluate(&tokens).to_string()
}

// 住所から都道府県とそれ以降を分割する
fn get_prefecture(address: &str) -> (String, String) {
    let prefectures = PREFECTURES.iter().map(|x| x.1).collect::<Vec<&str>>().join("|");
    let pattern = format!("^({})(.+*)$", prefectures);
    let re = Regex::new(&pattern).unwrap();

    match re.captures(address) {
        Some(m) => ((&m[1]).to_string(), (&m[2]).to_string()),
        None => ("".to_string(), address.to_string())
    }
}

/// Splits an address (prefecture already removed) into the municipality part
/// and the remainder.
///
/// Returns `(city, rest)`; `city` is empty and `rest` is the whole input when no
/// municipality can be recognised.
///
/// Two patterns are used:
///
/// * a list of specific municipality names that are matched literally first
///   (presumably because these names themselves contain 市/町/村, which would
///   mislead the generic pattern), and
/// * a generic greedy pattern that takes everything up to the last 市/区/町/村
///   that is still followed by at least one more character.
///
/// After a literal match the generic pattern is applied once more to the
/// remainder, and whatever it captures is appended to the city part.
fn get_city(address: &str) -> (String, String) {
    // Named groups are used instead of positional indices so that adding or
    // removing an inner group cannot silently shift the "rest" capture.
    // `田村(市|郡..町)` and `芳賀郡市貝町` are separate alternatives; without the
    // `|` between them neither could ever match.
    let special = Regex::new(concat!(
        "^(?P<city>余市郡(仁木町|赤井川村|余市町)|余市町|柴田郡村田町|(武蔵|東)村山市|",
        "[東西北]村山郡...?町|田村(市|郡..町)|芳賀郡市貝町|(佐波郡)?玉村町|[羽大]村市|",
        "(十日|大)町市|(中新川郡)?上市町|(野々|[四廿]日)市市|西八代郡市川三郷町|",
        "神崎郡市川町|高市郡(高取町|明日香村)|(吉野郡)?下市町|(杵島郡)?大町町)(?P<rest>.+)"
    )).unwrap();
    let generic = Regex::new("^(.+[市区町村])(.+)").unwrap();

    match special.captures(address) {
        Some(m) => match generic.captures(&m["rest"]) {
            Some(m2) => (format!("{}{}", &m["city"], &m2[1]), m2[2].to_string()),
            None => (m["city"].to_string(), m["rest"].to_string()),
        },
        None => match generic.captures(address) {
            Some(m2) => (m2[1].to_string(), m2[2].to_string()),
            None => (String::new(), address.to_string()),
        },
    }
}

// 半角カナを全角カナに、全角英数を半角英数に変換する
fn pre_convert(s: &str) -> (String) {
    let mut tmp = s.replace(" ", " ");
    for (hankaku, zenkaku) in HANKAKU_ZENKAKU_KANA_CHARS.iter() {
        tmp = tmp.replace(hankaku, zenkaku);
    }

    tmp.chars().map(|ch| {
        let code_point = ch as u32;
        if code_point >= 0xff10 && code_point <= 0xff51 {
            std::char::from_u32(code_point - 0xfee0).unwrap()
        } else {
            ch
        }
    }).collect()
}

/// Normalizes a trailing floor / room designation in a building name.
///
/// When the name ends with a number followed by 階, `F`, full-width `F`
/// (U+FF26), 号 or 号室, the number is rewritten with
/// `kanji_numeral_to_arabic_numerals` and an `F` suffix of either width is
/// replaced by 階. At least one character must precede the number. Names that
/// do not match are returned unchanged.
fn normalize_building_name(name: &str) -> String {
    // The full-width F is spelled as an escape so it stays distinct from the
    // ASCII F even if this file is width-normalized.
    let re = Regex::new(
        "(.+?)(([\\d〇一二三四五六七八九十百千万]+)(階|F|\u{ff26}|号|号室))$"
    ).unwrap();

    match re.captures(name) {
        Some(m) => {
            let number = kanji_numeral_to_arabic_numerals(&m[3]);
            let suffix = match &m[4] {
                "F" | "\u{ff26}" => "階",
                other => other,
            };
            format!("{}{}{}", &m[1], number, suffix)
        }
        None => name.to_string(),
    }
}

/// Splits the part of an address after the municipality into the street
/// address (town name plus block / lot numbers) and the building name.
///
/// Returns `(address, building_name)`.
///
/// The first stretch that looks like a block number is located: a number,
/// optionally followed by further numbers, counters (丁目, 丁, 番地, 番, 号),
/// separators or compass characters, and ending in a number or counter. The
/// numbers found in that stretch are joined with `-`; everything else in it is
/// dropped. Text before the stretch is kept as is, and text after it is passed
/// through `normalize_building_name`.
///
/// When no such stretch exists the input is returned unchanged together with
/// an empty building name.
fn get_address(address: &str) -> (String, String) {
    let all_num = "[\\d一二三四五六七八九十百千万]+";
    let pattern = format!(
        "(.*?)({}({}|丁目|丁|番地|番|号|-|‐|ー|−|の|東|西|南|北)*({}|丁目|丁|番地|番|号))(.*)",
        all_num, all_num, all_num
    );
    let block_re = Regex::new(&pattern).unwrap();
    let number_re = Regex::new(all_num).unwrap();

    match block_re.captures(address) {
        Some(m) => {
            // Group 2 always begins with a number, so at least one number is
            // found here; no fallback for "no number" is needed.
            let address_number = number_re
                .find_iter(&m[2])
                .map(|number| number.as_str())
                .collect::<Vec<&str>>()
                .join("-");
            (
                format!("{}{}", &m[1], address_number),
                normalize_building_name(&m[5]),
            )
        }
        None => (address.to_string(), String::new()),
    }
}

/// A Japanese postal address split into its normalized components.
#[derive(Debug)]
pub struct Address {
    /// Prefecture name; empty when none was recognised.
    prefecture: String,
    /// Municipality part; empty when none was recognised.
    city: String,
    /// Town name and block / lot numbers.
    address: String,
    /// Building name, including any floor or room designation.
    building_name: String,
}

impl Address {
    /// Parses a free-form address string into an [`Address`].
    ///
    /// The input is first passed through `pre_convert`; the prefecture, the
    /// municipality and the street address / building name are then split off
    /// one after another, each step working on what the previous one left over.
    pub fn from(addr: &str) -> Address {
        let normalized = pre_convert(addr);
        let (prefecture, after_prefecture) = get_prefecture(&normalized);
        let (city, after_city) = get_city(&after_prefecture);
        let (address, building_name) = get_address(&after_city);

        Address { prefecture, city, address, building_name }
    }
}