Source code for cardbuilder.lookup.en.word_freq

import csv
import sqlite3
from typing import Iterable, Tuple

from cardbuilder.common import Fieldname
from cardbuilder.common.util import fast_linecount, InDataDir, loading_bar, log, DATABASE_NAME, retry_with_logging
from cardbuilder.exceptions import WordLookupException
from cardbuilder.input.word import Word
from cardbuilder.lookup.data_source import ExternalDataDataSource
from cardbuilder.lookup.lookup_data import LookupData, outputs
from cardbuilder.lookup.value import SingleValue


@outputs({
    Fieldname.SUPPLEMENTAL: SingleValue
})
class WordFrequency(ExternalDataDataSource):
    """
    Provides English word frequency, taken from Peter Norvig's ngram frequency list (https://norvig.com/ngrams/).

    On construction every row of ``self.default_table`` is read into the in-memory
    ``self.frequency`` dict (word -> count). Lookups are case-insensitive: the
    queried form is lowercased before it is matched against that dict.
    """

    # Remote location and local file name of the raw tab-separated frequency list.
    # NOTE(review): presumably consumed by ExternalDataDataSource to fetch the file -- confirm.
    url = 'http://norvig.com/ngrams/count_1w.txt'
    filename = 'count_1w.txt'
    # NOTE(review): presumably the SQL column type used for stored content -- confirm against the base class.
    content_type = 'INT'

    def __init__(self):
        """Initialise the data source and load the whole frequency table into memory."""
        super(WordFrequency, self).__init__()
        log(self, 'Loading word frequency data from table...')
        c = self.conn.execute('''SELECT * FROM {}'''.format(self.default_table))
        # dict() requires every fetched row to be a two-item (word, frequency) pair.
        self.frequency = dict(c.fetchall())

    def lookup_word(self, word: Word, form: str, following_link: bool = False) -> LookupData:
        """
        Return the frequency of ``form`` as supplemental lookup data.

        :param word: the word object the lookup is performed for.
        :param form: the surface form to look up; matched case-insensitively.
        :param following_link: unused by this data source.
        :return: lookup data whose ``Fieldname.SUPPLEMENTAL`` value is the frequency as a string.
        :raises WordLookupException: if there is no frequency entry for ``form``.
        """
        # Test membership with the same lowercase normalisation that __getitem__
        # applies, so that e.g. "The" is found whenever "the" is present instead
        # of being rejected by a case-sensitive check.
        if form.lower() not in self.frequency:
            raise WordLookupException('No frequency information for {}'.format(form))

        content = str(self[form])
        return self.lookup_data_type(word, form, content, {
            Fieldname.SUPPLEMENTAL: SingleValue(content),
        })

    def _read_and_convert_data(self) -> Iterable[Tuple[str, int]]:
        """
        Parse the tab-separated frequency file into (word, frequency) pairs.

        The pairs are collected in a dict first, so when a word occurs more than
        once only its last frequency is kept.

        :return: a view of (word, frequency) items.
        """
        frequency = {}
        with InDataDir():
            line_count = fast_linecount(self.filename)
            # newline='' is what the csv module documentation recommends for files handed to csv.reader.
            with open(self.filename, 'r', encoding='utf-8', newline='') as f:
                reader = csv.reader(f, delimiter='\t')
                for row in loading_bar(reader, 'reading {}'.format(self.filename), line_count):
                    if not row:
                        # csv.reader yields an empty list for a blank line; there is nothing to unpack.
                        continue
                    word, freq = row
                    frequency[word] = int(freq)

        return frequency.items()

    def parse_word_content(self, word: Word, form: str, content: str, following_link: bool = False) -> LookupData:
        """No-op in this data source: the body does nothing and implicitly returns ``None``."""
        pass

    def __getitem__(self, word: str) -> int:
        """Return the frequency recorded for the lowercased ``word``, or 0 if it has no entry."""
        return self.frequency.get(word.lower(), 0)