Source code for cardbuilder.lookup.en_to_ja.eijiro

import re
from collections import defaultdict
from logging import WARNING
from os.path import abspath
from string import digits
from typing import Tuple, Iterable, Optional

from cardbuilder.common.config import Config
from cardbuilder.common import Fieldname
from cardbuilder.common.util import fast_linecount, loading_bar, log
from cardbuilder.exceptions import CardBuilderException, WordLookupException
from cardbuilder.input.word import Word
from cardbuilder.lookup.data_source import ExternalDataDataSource
from cardbuilder.lookup.lookup_data import outputs, LookupData
from cardbuilder.lookup.value import LinksValue, MultiListValue

# every single decimal digit character, kept as a set for cheap membership tests
digitset = {*digits}

# The marker strings below are module-level names (not attributes of the Eijiro class) because the
# @outputs declaration on that class needs to reference them.
example_sentence_symbol, additional_explanation_symbol = '■・', '◆'
pronunciation_symbol, pronunciation_important_symbol = '【発音】', '【発音！】'
katakana_reading_symbol = '【＠】'
inflections_symbol = '【変化】'
level_symbol = '【レベル】'
word_split_symbol = '【分節】'
# a link is not always introduced by ＝, so the bare form is recognised as well
link_symbol, second_link_symbol = '＝<→', '<→'

# Which output field the text that follows each marker is filed under. Several markers share a field.
content_sectioning_symbol_map = dict((
    (example_sentence_symbol, Fieldname.EXAMPLE_SENTENCES),
    (additional_explanation_symbol, Fieldname.SUPPLEMENTAL),
    (pronunciation_symbol, Fieldname.PRONUNCIATION_IPA),
    (pronunciation_important_symbol, Fieldname.PRONUNCIATION_IPA),
    (katakana_reading_symbol, Fieldname.KATAKANA),
    (inflections_symbol, Fieldname.INFLECTIONS),
    (level_symbol, Fieldname.SUPPLEMENTAL),
    (word_split_symbol, Fieldname.SUPPLEMENTAL),
    (link_symbol, Fieldname.LINKS),
    (second_link_symbol, Fieldname.LINKS),
))


@outputs({**{Fieldname.LINKS: LinksValue, Fieldname.DEFINITIONS: MultiListValue},
          **{fname: MultiListValue for fname in content_sectioning_symbol_map.values() if fname != Fieldname.LINKS}})
class Eijiro(ExternalDataDataSource):
    """Data source that ingests and parses a local Eijiro dictionary text file.

    Ingestion (``_read_and_convert_data``) folds consecutive file lines that share a headword into a single
    string of the form ``pos⦀content⚬pos⦀content...``. ``parse_word_content`` later splits that string apart
    again and files each piece of content under the field given by ``content_sectioning_symbol_map``.
    """
    # http://www.eijiro.jp/get-144.htm
    # https://www.eijiro.jp/spec.htm

    eijiro_conf_value = 'eijiro_loaded'

    line_head_symbol = '■'
    entry_delimiter = ' : '

    content_sectioning_symbols = set(content_sectioning_symbol_map.keys())

    # the capturing group makes re.split() keep the matched markers in its result
    content_sectioning_regex = re.compile(r'({})'.format('|'.join(re.escape(x) for x in content_sectioning_symbols)))

    # we don't currently use these two, but they're present in many headers
    blank_indirect_obj = '__'
    blank_direct_obj = '‾'

    header_pos_regex = re.compile(r'\{.+\}')

    # Maps a marker (with its surrounding brackets stripped) to its long-form label. Stripping the brackets
    # makes '{名}'/'【名】' and '{動}'/'【動】' collide; only '{...}' markers are ever looked up (they are what
    # header_pos_regex matches), so the first-listed '{...}' label is kept instead of being overwritten.
    pos_dictionary = {}
    for _marker, _label in {
        '{名}': '名詞',
        '{代}': '代名詞',
        '{形}': '形容詞',
        '{動}': '動詞',
        '{他動}': '他動詞',
        '{自動}': '自動詞',
        '{助}': '助動詞',
        '{句動}': '句動詞',
        '{副}': '副詞',
        '{接}': '接続詞',
        '{間}': '間投詞',
        '{前}': '前置詞',
        '{略}': '略語',
        '{組織}': '組織名（会社名、団体名など）',
        '【反】': '反意語',
        '【対】': '対語',
        '【名】': '名詞形',
        '【類】': '類語',
        '【動】': '動詞形',
        '【同】': '同意語',
        '《イ》': 'インターネット',
        '《コ》': 'コンピュータ',
        '《レ》': 'レターやEメールの文例',
        '《医》': '医学',
        '《薬》': '薬学',
        '《化》': '化学',
        '〈米〉': 'アメリカ英語',
        '〈英〉': 'イギリス英語',
        '〈豪〉': 'オーストラリア英語',
        '〈NZ〉': 'ニュージーランド英語',
        '〈アイル〉': 'アイルランド方言',
        '〈スコ〉': 'スコットランド方言',
        '〈俗〉': '俗語',
        '〈米俗〉': 'アメリカの俗語',
        '〈話〉': '話し言葉（口語表現）',
        '〈文〉': '文語（書き言葉）',
        '〈米話〉': 'アメリカの話し言葉（口語表現）',
        '〈野球俗〉': '野球で使われる俗語',
        '〈米海軍俗〉': '米海軍で使われる俗語'
    }.items():
        pos_dictionary.setdefault(_marker[1:-1], _label)
    del _marker, _label  # loop temporaries should not linger as class attributes

    # separators used inside the strings produced by _read_and_convert_data
    header_data_delimiter = '⦀'  # between a line's part of speech and its content
    line_data_delimiter = '⚬'  # between the source lines merged into one headword

    def _read_and_convert_data(self) -> Iterable[Tuple[str, str]]:
        """Read the Eijiro file and yield one ``(word, content)`` pair per run of same-headword lines.

        Each line is split at the first ``' : '`` into a header and its content. A ``{...}`` marker in the
        header is taken as the part of speech and prefixed to the content with ``header_data_delimiter``.
        Consecutive lines whose headwords match case-insensitively are joined with ``line_data_delimiter``.

        This is a generator, so the exceptions below surface when it is first iterated.

        :raises FileNotFoundError: if no file location was given to the constructor.
        :raises ValueError: if a non-blank line does not contain the ``' : '`` entry delimiter.
        """
        if self.file_loc is None:
            raise FileNotFoundError('Must set Eijiro location the first time for data ingestion')
        lines = fast_linecount(self.file_loc)
        prev_word = None
        prev_content = None
        # the context manager closes the file even if the consumer abandons the generator early
        with open(self.file_loc, 'r', encoding='shift_jisx0213') as eijiro_file:
            for line in loading_bar(eijiro_file, 'reading eijiro', lines):
                if not line.strip():
                    continue  # a blank line carries no entry and has no delimiter to split on

                header_end = line.index(Eijiro.entry_delimiter)
                header = line[1:header_end].strip()  # start at 1 to drop the ■
                content = line[header_end + len(Eijiro.entry_delimiter):].strip()

                pos_marking_match = Eijiro.header_pos_regex.search(header)
                if pos_marking_match is not None:
                    pos_content = pos_marking_match.group(0)[1:-1]
                    word = header[:pos_marking_match.start()].strip()
                    if '-' in pos_content:
                        # e.g. "名-1": take the first part that is not a lone digit. The default keeps a
                        # marker made only of digits from raising StopIteration inside this generator.
                        pos = next((x.strip() for x in pos_content.split('-') if x.strip() not in digitset),
                                   pos_content.strip())
                        pos = Eijiro.pos_dictionary.get(pos, pos)
                    else:
                        # NOTE(review): unlike the hyphenated case, this is not expanded via pos_dictionary;
                        # confirm whether that difference is intended
                        pos = pos_content.strip()
                else:
                    word = header.strip()
                    pos = None

                if pos is not None:
                    content = self.header_data_delimiter.join((pos, content))

                if prev_word is None:
                    prev_word = word
                elif word.lower() == prev_word.lower():
                    # the .lower() is necessary because sequential entries sometimes go back and forth on
                    # case, like "the" -> "The" -> "the"
                    content = self.line_data_delimiter.join((prev_content, content))
                    # prefer an all-lowercase spelling as the key; otherwise keep the one we already have
                    prev_word = word if word.islower() else prev_word
                else:
                    yield prev_word, prev_content
                    prev_word = word

                prev_content = content

        # flush the final headword; an empty file has nothing to flush
        if prev_word is not None:
            yield prev_word, prev_content

    def parse_word_content(self, word: Word, form: str, content: str, following_link: bool = False) -> LookupData:
        """Parse the stored content string of one headword into this source's lookup data object.

        :param word: the word being looked up; passed through to link lookups and to the result.
        :param form: the form of the word that was found; used in log messages and passed to the result.
        :param content: the string produced for this headword by ``_read_and_convert_data``.
        :param following_link: True when this call is itself resolving a link, in which case links found
            in ``content`` are not followed any further.
        :return: ``self.lookup_data_type(word, form, content, output)``, where ``output`` maps each field
            to its parsed value.
        :raises CardBuilderException: if markers and text do not alternate as expected within a line.
        """
        line_parses = []
        for line in content.split(self.line_data_delimiter):
            line_attrs = defaultdict(list)

            if self.header_data_delimiter in line:
                # split once only, so a stray delimiter later in the line cannot break the unpacking
                header_data, line = line.split(self.header_data_delimiter, 1)
                # only data in the header seems to be the POS
                line_attrs[Fieldname.PART_OF_SPEECH] = header_data

            content_sections = Eijiro.content_sectioning_regex.split(line)
            if content_sections[0] not in Eijiro.content_sectioning_symbols:
                # text before the first marker is the definition itself
                leading_content = content_sections.pop(0)
                if leading_content:  # leading content is sometimes empty, don't want to add blank definitions
                    line_attrs[Fieldname.DEFINITIONS].append(leading_content)

            # what remains should alternate strictly: marker, text, marker, text, ...
            section_header = None
            for section in content_sections:
                is_symbol = section in Eijiro.content_sectioning_symbols
                if section_header is None and is_symbol:
                    section_header = section
                elif section_header is not None and not is_symbol:
                    key = content_sectioning_symbol_map[section_header]
                    if key != Fieldname.LINKS:
                        line_attrs[key].append(section.strip('、'))
                    elif not following_link:  # only follow links one level deep
                        linked_word, closing_bracket, _ = section.partition('>')
                        if not closing_bracket:
                            # no closing '>' means we cannot tell where the link target ends
                            log(self, 'Found malformed link "{}" in definition of word "{}"'.format(
                                section, form
                            ), WARNING)
                        else:
                            try:
                                line_attrs[Fieldname.LINKS].append(self.lookup_word(word, linked_word,
                                                                                    following_link=True))
                            except WordLookupException:
                                log(self, 'Found link to apparently missing word "{}" in definition of word "{}"'.format(
                                    linked_word, form
                                ), WARNING)

                    section_header = None
                else:
                    raise CardBuilderException('Unexpected sectioning sequence in Eijiro dictionary')

            line_parses.append(line_attrs)

        # regroup the per-line values as field -> part of speech -> values; links are collected separately
        aggregated_parse = defaultdict(lambda: defaultdict(list))
        links = []
        for val_map in line_parses:
            pos = val_map.get(Fieldname.PART_OF_SPEECH, None)
            links.extend(val_map.get(Fieldname.LINKS, ()))
            for key, val in val_map.items():
                if key == Fieldname.PART_OF_SPEECH or key == Fieldname.LINKS:
                    continue
                aggregated_parse[key][pos].extend(val)

        output = {}
        if links:
            output[Fieldname.LINKS] = LinksValue(links)
        for val_key, val_dict in aggregated_parse.items():
            output[val_key] = MultiListValue([([val for val in vals if val], pos) for pos, vals in val_dict.items()])

        # borrow link-friendly fields from linked entries when this entry lacks them or has them empty
        if Fieldname.LINKS in output:
            for linked_word_dict in output[Fieldname.LINKS].get_data():
                for key, value in linked_word_dict.get_data().items():
                    if (key not in output or not output[key].get_data()) and key in Fieldname.link_friendly_fields():
                        output[key] = value

        return self.lookup_data_type(word, form, content, output)

    def _fetch_remote_files_if_necessary(self):
        pass  # No remote files to fetch, takes an explicit file location

    def __init__(self, eijiro_location: Optional[str] = None):
        """Record where the Eijiro file lives, then run the base-class initialisation.

        :param eijiro_location: path to the Eijiro text file. When given, it is stored as an absolute path
            and the ``eijiro_loaded`` config value is set to ``'yes'``. When omitted, ``file_loc`` is None
            and ``_read_and_convert_data`` raises FileNotFoundError if it is run.
        """
        if eijiro_location is not None:
            self.file_loc = abspath(eijiro_location)
            Config.set(self.eijiro_conf_value, 'yes')
        else:
            self.file_loc = None
        super().__init__()