Source code for cardbuilder.lookup.en_to_ja.gene_dict

import tarfile
from json import dumps, loads
from os.path import exists
from typing import Iterable, Tuple

from cardbuilder.common import Fieldname
from cardbuilder.common.util import log, download_to_stream_with_loading_bar
from cardbuilder.input.word import Word
from cardbuilder.lookup.data_source import ExternalDataDataSource
from cardbuilder.lookup.lookup_data import outputs, LookupData
from cardbuilder.lookup.value import SingleValue


@outputs({
    Fieldname.DEFINITIONS: SingleValue,
    Fieldname.SUPPLEMENTAL: SingleValue,
    Fieldname.EXAMPLE_SENTENCES: SingleValue
})
class GeneDict(ExternalDataDataSource):
    """The DataSource for the GENE95 dictionary (http://www.namazu.org/~tsuchiya/sdic/data/gene.html)

    Definitions are returned as a single line of text, because while definitions are usually delimited with a comma,
    this does not hold true all the time.

    The local data file is read as alternating pairs of lines: a word line (the headword, optionally followed by
    supplemental data) and then a definition line (the definition, optionally followed by an example sentence)."""

    # Separates the headword from its supplemental data on a word line.
    supplemental_data_delim = '     '
    # Separates the definition from the example sentence on a definition line.
    example_sentence_delim = ' / '
    # Every line before the first one starting with this prefix is skipped.
    expected_first_element = '!'
    # Local UTF-8 copy of the dictionary; written by _fetch_remote_files_if_necessary.
    filename = 'gene_dict.txt'
    # Gzipped tarball that contains the dictionary as the member 'gene.txt'.
    url = 'http://www.namazu.org/~tsuchiya/sdic/data/gene95.tar.gz'

    def _read_and_convert_data(self) -> Iterable[Tuple[str, str]]:
        """Parse the local dictionary file into (word, serialized content) pairs.

        The whole file is read (and closed) before the first pair is yielded. If a word occurs more than once,
        values read later overwrite the earlier ones.

        :return: an iterable of ``(word, json_string)`` tuples, where the JSON object maps ``Fieldname`` member
            names to the text found for that word. The definition is always present; supplemental data and the
            example sentence are only included when the file provided them.
        """
        definitions = {}
        supplemental = {}
        examples = {}

        found_first_valid_line = False
        reading_word = True
        # The context manager guarantees the handle is released as soon as parsing is done, instead of
        # leaving it open for as long as the generator object stays alive.
        with open(self.filename, encoding='utf-8') as gene_file:
            for line in gene_file:
                if not found_first_valid_line:
                    if not line.startswith(self.expected_first_element):
                        continue  # still inside the header that precedes the entries
                    # The marker line is not discarded: it is parsed below as the first word line.
                    found_first_valid_line = True

                if reading_word:
                    word_line_content = line.split(self.supplemental_data_delim)
                    word = word_line_content[0].strip()
                    if len(word_line_content) > 1:
                        supplemental[word] = word_line_content[1].strip()
                    reading_word = False
                else:  # we're reading a definition line
                    definition_line_list = line.strip().split(self.example_sentence_delim)
                    definitions[word] = definition_line_list[0]
                    if len(definition_line_list) > 1:
                        # Only the first segment after the delimiter is kept as the example.
                        examples[word] = definition_line_list[1]
                    reading_word = True

        for word, definition in definitions.items():
            data = {Fieldname.DEFINITIONS: definition}
            if word in supplemental:
                data[Fieldname.SUPPLEMENTAL] = supplemental[word]
            if word in examples:
                data[Fieldname.EXAMPLE_SENTENCES] = examples[word]
            # Fieldname members are serialized by name so parse_word_content can look them up again.
            yield word, dumps({key.name: val for key, val in data.items()})

    def parse_word_content(self, word: Word, form: str, content: str, following_link: bool = False) -> LookupData:
        """Rebuild the lookup data for a word from its serialized content.

        :param word: the word that was looked up.
        :param form: the form of the word that was looked up.
        :param content: JSON string in the format produced by ``_read_and_convert_data``.
        :param following_link: not used by this data source.
        :return: an instance of ``self.lookup_data_type`` whose values are ``SingleValue`` objects keyed by the
            ``Fieldname`` member named by each JSON key.
        """
        return self.lookup_data_type(word, form, content, {
            Fieldname[key]: SingleValue(val) for key, val in loads(content).items()
        })

    def _fetch_remote_files_if_necessary(self):
        """Download and extract the dictionary unless the local file already exists.

        The ``gene.txt`` member of the tarball at ``url`` is decoded from ``shift_jisx0213`` and written to
        ``filename`` as UTF-8.
        """
        if exists(self.filename):
            return

        log(self, '{} not found - downloading and extracting...'.format(self.filename))
        stream = download_to_stream_with_loading_bar(self.url)
        # Close the archive once the member has been read rather than leaking the TarFile object.
        with tarfile.open(fileobj=stream, mode='r:gz') as tar:
            gene_data = tar.extractfile('gene.txt').read().decode('shift_jisx0213')
        with open(self.filename, 'w+', encoding='utf-8') as f:
            f.write(gene_data)