Source code for cardbuilder.lookup.en_to_ja.ejdict_hand

import csv
from collections import defaultdict
from os.path import exists
from string import ascii_lowercase
from typing import Tuple, Iterable

import requests

from cardbuilder.common import Fieldname
from cardbuilder.common.util import log, loading_bar
from cardbuilder.exceptions import WordLookupException
from cardbuilder.input.word import Word
from cardbuilder.lookup.data_source import ExternalDataDataSource
from cardbuilder.lookup.lookup_data import outputs, LookupData
from cardbuilder.lookup.value import LinksValue, ListValue


@outputs({
    Fieldname.DEFINITIONS: ListValue,
    Fieldname.LINKS: LinksValue
})
class EJDictHand(ExternalDataDataSource):
    """Data source backed by the EJDict-hand dictionary files.

    The raw data is published as one tab-separated file per starting letter
    (``a.txt`` .. ``z.txt``) in the ``kujirahand/EJDict`` GitHub repository.
    The pieces are downloaded once and cached locally in ``filename``.

    Each entry's content is a list of items joined by ``definition_delim``.
    An item beginning with ``link_symbol`` is a reference to another headword
    rather than a definition.
    """

    # Local cache file holding the concatenated per-letter downloads.
    filename = 'ejdicthand.txt'
    # Separator between individual definitions/links inside one entry.
    definition_delim = ' / '
    # Prefix marking an item as a link to another headword.
    link_symbol = '='

    # https://kujirahand.com/web-tools/EJDictFreeDL.php
    def _fetch_remote_files_if_necessary(self) -> None:
        """Download and assemble the dictionary file unless it already exists.

        Raises:
            requests.HTTPError: if any of the per-letter files cannot be
                downloaded. Nothing is written to disk in that case, so a
                later run retries the download instead of reusing a corrupt
                cache file.
        """
        if exists(self.filename):
            return

        log(self, '{} not found - downloading and assembling file pieces...'.format(self.filename))
        pieces = []
        for letter in loading_bar(ascii_lowercase, 'downloading EJDict-hand files'):
            url = 'https://raw.githubusercontent.com/kujirahand/EJDict/master/src/{}.txt'.format(letter)
            response = requests.get(url)
            # Without this check an error page (404, 5xx, ...) would be
            # silently stored as dictionary data and cached forever.
            response.raise_for_status()
            piece = response.content
            # Make sure the last row of one piece cannot be fused with the
            # first row of the next one when the pieces are concatenated.
            if piece and not piece.endswith(b'\n'):
                piece += b'\n'
            pieces.append(piece)

        # Write only after every piece was fetched successfully.
        with open(self.filename, 'wb') as f:
            f.write(b''.join(pieces))

    def _read_and_convert_data(self) -> Iterable[Tuple[str, str]]:
        """Read the cached file and yield ``(word, content)`` pairs.

        A row's first column may list several comma-separated headwords; each
        of them receives the row's definitions. Definitions of a headword
        that occurs in several rows are merged in file order and re-joined
        with ``definition_delim``.

        Raises:
            ValueError: if a non-empty row does not have exactly two columns.
        """
        definition_map = defaultdict(list)
        # newline='' lets the csv module do its own newline handling, as its
        # documentation requires for file objects.
        with open(self.filename, 'r', encoding='utf-8', newline='') as f:
            reader = csv.reader(f, delimiter='\t')
            for row in reader:
                if not row:
                    # csv.reader yields [] for blank lines; nothing to store.
                    continue
                word_entry, definition = row
                definitions = definition.split(self.definition_delim)
                for word in word_entry.split(','):
                    definition_map[word].extend(definitions)

        return ((word, self.definition_delim.join(defs)) for word, defs in definition_map.items())

    def parse_word_content(self, word: Word, form: str, content: str, following_link: bool = False) -> LookupData:
        """Turn one entry's raw content into lookup data.

        Args:
            word: the word being looked up.
            form: the form of the word whose entry ``content`` belongs to.
            content: the entry text, items separated by ``definition_delim``.
            following_link: True when this entry was reached through a link
                from another entry; links are then not followed any further,
                which keeps link chains from recursing.

        Returns:
            Lookup data holding the entry's definitions and, when the entry
            has links and was not itself reached through a link, the lookup
            results of the linked words under ``Fieldname.LINKS``. An entry
            consisting only of links is replaced by the result for its first
            link, with the remaining links attached to it.

        Raises:
            WordLookupException: if the entry has no definitions and no link
                that may be followed.
        """
        content_items = content.split(self.definition_delim)
        definitions = [c for c in content_items if not c.startswith(self.link_symbol)]
        links = [c[len(self.link_symbol):] for c in content_items if c.startswith(self.link_symbol)]

        if definitions:
            output = self.lookup_data_type(word, form, content, {
                Fieldname.DEFINITIONS: ListValue(definitions),
            })
            links_to_attach = links
        elif links and not following_link:
            # Link-only entry: the first linked word stands in for this one.
            first_link, *links_to_attach = links
            output = self.lookup_word(word, first_link, following_link=True)
        else:
            raise WordLookupException('Empty entry found for word {} in EJDictHand'.format(form))

        if links_to_attach and not following_link:
            output[Fieldname.LINKS] = LinksValue([self.lookup_word(word, linked_word, following_link=True)
                                                  for linked_word in links_to_attach])

        return output