tccr.it

       wr.py - wr - Translate a term via WordReference.com
 (HTM) hg clone https://bitbucket.org/iamleot/wr
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
       ---
       wr.py
       ---
            1 #!/usr/pkg/bin/python3.7
            2 
            3 #
            4 # Copyright (c) 2019 Leonardo Taccari, Rocky Hotas
            5 # All rights reserved.
            6 #
            7 # Redistribution and use in source and binary forms, with or without
            8 # modification, are permitted provided that the following conditions
            9 # are met:
           10 #
           11 # 1. Redistributions of source code must retain the above copyright
           12 #    notice, this list of conditions and the following disclaimer.
           13 # 2. Redistributions in binary form must reproduce the above copyright
           14 #    notice, this list of conditions and the following disclaimer in the
           15 #    documentation and/or other materials provided with the distribution.
           16 #
           17 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
           18 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
           19 # TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
           20 # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS
           21 # BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
           22 # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
           23 # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
           24 # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
           25 # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
           26 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
           27 # POSSIBILITY OF SUCH DAMAGE.
           28 #
           29 
           30 
           31 """
           32 Translate a term via WordReference.com
           33 
           34 wr is a script/module to translate terms via WordReference.com.
           35 """
           36 
           37 
           38 from bs4 import BeautifulSoup, SoupStrainer
           39 from typing import List
           40 from urllib import parse, request
           41 import collections
           42 import textwrap
           43 
           44 
           45 WORDREFERENCE_URL = 'https://www.wordreference.com/{dictionary}/{term}'
           46 WORDREFERENCE_USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0'
           47 
           48 
           49 Term = collections.namedtuple('Term', [
           50     'term',
           51     'term_type',
           52     'term_description',
           53     'term_category',
           54     'translations',
           55     'translation_contexts',
           56     'translation_types',
           57 ])
           58 Term.term.__doc__ += 'Term'
           59 Term.term_category.__doc__ += 'Category of translation'
           60 Term.term_description.__doc__ += 'Description of the term'
           61 Term.term_type.__doc__ += 'Type of the term'
           62 Term.translations.__doc__ += 'List of all translations'
           63 Term.translation_contexts.__doc__ += 'List of contexts of all the translations'
           64 Term.translation_types.__doc__ += 'List of types of all the translations'
           65 
           66 
           67 def translate(dictionary: str, term: str) -> List[Term]:
           68     """Translate a term scraping WordReference.com
           69 
           70     Given a pair of dictionaries (e.g. `enit') via `dictionary',
           71     translate the term `term' and return all translation as a list of Term-s.
           72 
           73     Supported dictionaries are: `ar' (Arabic), `cz' (Czech), `de' (German),
           74     `en' (English), `es' (Spanish), `fr' (French), `gr' (Greek),
           75     `it' (Italian), `ja' (Japanese), `ko' (Korean), `nl' (Dutch),
           76     `pl' (Polish), `pt' (Portuguese), `ro' (Romanian), `ru' (Russian),
           77     `sv' (Swedish), `tr' (Turkish), `zh' (Chinese).
           78 
           79     >>> ts = translate('enit', 'example')
           80     >>> type(ts)
           81     <class 'list'>
           82     >>> len(ts)
           83     17
           84     >>> type(ts[0])
           85     <class 'wr.Term'>
           86     >>> ts[0].term
           87     'example'
           88     >>> ts[0].term_category
           89     'Principal Translations'
           90     >>> ts[0].term_type
           91     'n'
           92     >>> ts[0].term_description
           93     '(typical instance)'
           94     >>> ts[0].translations
           95     ['esempio']
           96     >>> ts[0].translation_contexts
           97     ['']
           98     >>> ts[0].translation_types
           99     ['nm']
          100     >>> ts[3].term
          101     'by way of example'
          102     >>> ts[3].term_category
          103     'Compound Forms'
          104     >>> ts[3].term_type
          105     'adv'
          106     >>> ts[3].term_description
          107     '(as an example)'
          108     >>> ts[3].translations
          109     ["a titolo d'esempio", "a mo' di esempio"]
          110     >>> ts[3].translation_contexts
          111     ['', '']
          112     >>> ts[3].translation_types
          113     ['avv', 'avv']
          114     """
          115     req = request.Request(WORDREFERENCE_URL.format(
          116                               dictionary=parse.quote(dictionary),
          117                               term=parse.quote(term)))
          118     req.add_header('User-Agent', WORDREFERENCE_USER_AGENT)
          119     with request.urlopen(req) as r:
          120         content = BeautifulSoup(r, 'html.parser',
          121                                 parse_only=SoupStrainer(id='articleWRD'))
          122 
          123     ts = []
          124 
          125     if content:
          126         categ = ''
          127         for table in content.find_all('table', class_='WRD'):
          128             t = {}
          129             for tr in table.find_all('tr'):
          130                 if 'langHeader' in tr.get('class', []):
          131                     continue
          132     
          133                 if tr.get('id'):
          134                     if t:
          135                         t['term_category'] = categ
          136                         ts.append(Term(**t))
          137                         t = {}
          138 
          139                 if 'wrtopsection' in tr.get('class', []):
          140                     if t:
          141                         t['term_category'] = categ
          142                         ts.append(Term(**t))
          143                         t = {}
          144                     categ = tr.td['title'].strip()
          145     
          146                 if tr.find('td', class_='FrWrd'):
          147                     frwrd = tr.find('td', class_='FrWrd')
          148                     [a.decompose() for a in frwrd.find_all('a', text='⇒')]
          149                     t['term'] = frwrd.strong.text.strip()
          150                     pos2 = frwrd.find('em', class_='tooltip POS2')
          151                     if pos2 and pos2.children and len(list(pos2.children)) > 0:
          152                         t['term_type'] = list(pos2.children)[0].strip()
          153                     else:
          154                         t['term_type'] = ''
          155                     _, fr2, _ = tr.find_all('td')
          156                     t['term_description'] = fr2.text.strip()
          157     
          158                 if tr.find('td', class_='ToWrd'):
          159                     towrd = tr.find('td', class_='ToWrd')
          160                     [a.decompose() for a in towrd.find_all('a', text='⇒')]
          161                     pos2 = towrd.find('em', class_='POS2')
          162                     to2 = tr.find('td', class_='To2')
          163                     if not t.get('translation_contexts'):
          164                         t['translation_contexts'] = []
          165                     if not t.get('translation_types'):
          166                         t['translation_types'] = []
          167                     if to2:
          168                         t['translation_contexts'].append(to2.text.strip())
          169                     else:
          170                         t['translation_contexts'].append('')
          171                     if pos2 and pos2.children and len(list(pos2.children)) > 0:
          172                         t['translation_types'].append(list(pos2.children)[0].strip())
          173                         towrd.find('em', class_='POS2').clear()
          174                     else:
          175                         t['translation_types'].append('')
          176                     if not t.get('translations'):
          177                         t['translations'] = []
          178                     t['translations'].append(towrd.text.strip())
          179     
          180                 if tr.find('td', class_='FrEx'):
          181                     frex = tr.find('td', class_='FrEx')
          182     
          183                 if tr.find('td', class_='ToEx'):
          184                     toex = tr.find('td', class_='ToEx')
          185     
          186             else:
          187                 if t:
          188                     t['term_category'] = categ
          189                     ts.append(Term(**t))
          190                     t = {}
          191 
          192     return ts
          193 
          194 
          195 def print_term(term: Term, ansi_escape: bool = False):
          196     """Pretty print a Term"""
          197     print('{term} {term_type}'.format(term=bold(term.term, ansi_escape),
          198                                       term_type=underline(term.term_type, ansi_escape)))
          199     print('{description}'.format(description=term.term_description))
          200     for translation, translation_context, translation_type in zip(term.translations, term.translation_contexts, term.translation_types):
          201         if translation_context:
          202             print('{context}'.format(context=translation_context))
          203         print(textwrap.fill('{translation} {translation_type}'.format(
          204                             translation=translation,
          205                             translation_type=underline(translation_type, ansi_escape)),
          206               initial_indent=' ' * 8,
          207               subsequent_indent=' ' * 12,
          208               width=80,
          209               break_long_words=False,
          210               break_on_hyphens=False))
          211 
          212 
          213 def underline(text: str, ansi_escape: bool = False) -> str:
          214     """Underline via ANSI escape characters text
          215 
          216     Given `text', underline it via ANSI escape characters if `ansi_escape' is
          217     True, otherwise return it unmodified.
          218     """
          219     if ansi_escape:
          220         return '\033[4m{text}\033[0m'.format(text=text)
          221     else:
          222         return text
          223 
          224 
          225 def bold(text: str, ansi_escape: bool = False) -> str:
          226     """Bold via ANSI escape characters text
          227 
          228     Given `text', bold it via ANSI escape characters if `ansi_escape' is
          229     True, otherwise return it unmodified.
          230     """
          231     if ansi_escape:
          232         return '\033[1m{text}\033[0m'.format(text=text)
          233     else:
          234         return text
          235 
          236 
          237 if __name__ == '__main__':
          238     import argparse
          239 
          240     parser = argparse.ArgumentParser(description="WordReference cli interface")
          241     parser.add_argument("dictionary", type=str, help="specify the dictionaries to be used: e.g., enit")
          242     parser.add_argument("term", type=str, help="specify the term to be translated")
          243     parser.add_argument("-a", action="store_true", help="show all the translation categories", default=False)
          244     parser.add_argument("-e", action="store_true", help="format text via ANSI escape characters", default=False)
          245     args = parser.parse_args()
          246 
          247     mcats = ('Principal Translations', 'Additional Translations')
          248 
          249     pcat = ''
          250     for i, t in enumerate(translate(args.dictionary, args.term)):
          251         if not args.a and t.term_category not in mcats:
          252             continue
          253         if i > 0:
          254             print()
          255         if t.term_category != pcat:
          256             pcat = t.term_category
          257             if i > 0:
          258                 print()
          259             print(bold(pcat, args.e))
          260             print()
          261         print_term(t, args.e)