Source code for cardbuilder.lookup.en_to_ja.gene_dict

import tarfile
from json import dumps, loads
from os.path import exists
from typing import Iterable, Tuple

from cardbuilder.common import Fieldname
from cardbuilder.common.util import log, download_to_stream_with_loading_bar
from cardbuilder.input.word import Word
from cardbuilder.lookup.data_source import ExternalDataDataSource
from cardbuilder.lookup.lookup_data import outputs, LookupData
from cardbuilder.lookup.value import SingleValue


@outputs({
    Fieldname.DEFINITIONS: SingleValue,
    Fieldname.SUPPLEMENTAL: SingleValue,
    Fieldname.EXAMPLE_SENTENCES: SingleValue,
})
class GeneDict(ExternalDataDataSource):
    """The DataSource for the GENE95 dictionary (http://www.namazu.org/~tsuchiya/sdic/data/gene.html)

    Definitions are returned as a single line of text, because while definitions are usually delimited
    with a comma, this does not hold true all the time.
    """

    # NOTE(review): this file was recovered from a rendered HTML page, which may have collapsed runs of
    # whitespace inside the two delimiter literals below - confirm them against the upstream source.
    supplemental_data_delim = ' '  # separates the headword from supplemental data on a word line
    example_sentence_delim = ' / '  # separates the definition from an example on a definition line
    expected_first_element = '!'  # parsing starts at the first line that begins with this marker
    filename = 'gene_dict.txt'  # local UTF-8 copy of the dictionary text
    url = 'http://www.namazu.org/~tsuchiya/sdic/data/gene95.tar.gz'

    def _read_and_convert_data(self) -> Iterable[Tuple[str, str]]:
        """Parse the local dictionary file and yield one ``(word, json_content)`` pair per headword.

        Every line before the first line starting with ``expected_first_element`` is skipped. From
        that line onward the file is read as alternating pairs: a word line (headword, optionally
        followed by supplemental data) and then a definition line (definition, optionally followed
        by an example sentence). The JSON content maps ``Fieldname`` member names to their text.
        """
        definitions = {}
        supplemental = {}
        examples = {}

        found_first_valid_line = False
        reading_word = True
        # The whole file is consumed before the first yield, so the handle can be closed eagerly
        # instead of staying open for the lifetime of the generator.
        with open(self.filename, encoding='utf-8') as gene_file:
            for line in gene_file:
                if not found_first_valid_line:
                    if not line.startswith(self.expected_first_element):
                        continue
                    # The marker line itself is handled as the first word line below.
                    found_first_valid_line = True

                if reading_word:
                    word_line_content = line.split(self.supplemental_data_delim)
                    word = word_line_content[0].strip()
                    if len(word_line_content) > 1:
                        supplemental[word] = word_line_content[1].strip()
                    reading_word = False
                else:
                    # we're reading a definition line
                    definition_line_list = line.strip().split(self.example_sentence_delim)
                    definitions[word] = definition_line_list[0]
                    if len(definition_line_list) > 1:
                        examples[word] = definition_line_list[1]
                    reading_word = True

        for word, definition in definitions.items():
            data = {Fieldname.DEFINITIONS: definition}
            if word in supplemental:
                data[Fieldname.SUPPLEMENTAL] = supplemental[word]
            if word in examples:
                data[Fieldname.EXAMPLE_SENTENCES] = examples[word]

            # Enum members are not JSON-serializable as keys, so they are stored by name.
            yield word, dumps({key.name: val for key, val in data.items()})

    def parse_word_content(self, word: Word, form: str, content: str, following_link: bool = False) -> LookupData:
        """Build lookup data for ``word`` from the JSON ``content`` produced by ``_read_and_convert_data``.

        Each key in the JSON object is resolved back to a ``Fieldname`` member by name and its text
        is wrapped in a ``SingleValue``. ``following_link`` is accepted but not used here.
        """
        return self.lookup_data_type(word, form, content, {
            Fieldname[key]: SingleValue(val) for key, val in loads(content).items()
        })

    def _fetch_remote_files_if_necessary(self):
        """Download the GENE95 archive and write it out as UTF-8 text, unless the file already exists."""
        if exists(self.filename):
            return

        log(self, '{} not found - downloading and extracting...'.format(self.filename))
        stream = download_to_stream_with_loading_bar(self.url)
        # Close the archive as soon as the single member we need has been read.
        with tarfile.open(fileobj=stream, mode='r:gz') as tar:
            # The archive member is decoded as Shift JIS X 0213 and re-encoded as UTF-8 on write.
            gene_data = tar.extractfile('gene.txt').read().decode('shift_jisx0213')

        with open(self.filename, 'w+', encoding='utf-8') as f:
            f.write(gene_data)