tccr.it

       Parse all tables and add a term_category to Term - wr - Translate a term via WordReference.com
 (HTM) hg clone https://bitbucket.org/iamleot/wr
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
       ---
 (DIR) changeset 395bf3c1b7d4906b8b127e201dde50a8a65aabe0
 (DIR) parent c5b5d59472259bd4123d5641cdbb11677f4a9d98
 (HTM) Author: Leonardo Taccari <iamleot@gmail.com>
       Date:   Thu, 21 Nov 2019 16:43:53 +0100
       
       Parse all tables and add a term_category to Term
       
       Honor all translations and categories.  Previously only the first table was
       parsed (`Principal Translations' and `Additional Translations').
       
       Store each term's category in the new term_category field of Term.
       
       Update copyright.
       
       Patch from Rocky Hotas, thanks!
       
       Diffstat:
        wr.py |  136 ++++++++++++++++++++++++++++++++++++++++-------------------------
        1 files changed, 83 insertions(+), 53 deletions(-)
       ---
       diff -r c5b5d5947225 -r 395bf3c1b7d4 wr.py
       --- a/wr.py     Wed Oct 30 09:31:06 2019 +0000
       +++ b/wr.py     Thu Nov 21 16:43:53 2019 +0100
       @@ -1,7 +1,7 @@
        #!/usr/pkg/bin/python3.7
        
        #
       -# Copyright (c) 2019 Leonardo Taccari
       +# Copyright (c) 2019 Leonardo Taccari, Rocky Hotas
        # All rights reserved.
        #
        # Redistribution and use in source and binary forms, with or without
       @@ -50,11 +50,13 @@
            'term',
            'term_type',
            'term_description',
       +    'term_category',
            'translations',
            'translation_contexts',
            'translation_types',
        ])
        Term.term.__doc__ += 'Term'
       +Term.term_category.__doc__ += 'Category of translation'
        Term.term_description.__doc__ += 'Description of the term'
        Term.term_type.__doc__ += 'Type of the term'
        Term.translations.__doc__ += 'List of all translations'
       @@ -78,11 +80,13 @@
            >>> type(ts)
            <class 'list'>
            >>> len(ts)
       -    3
       +    17
            >>> type(ts[0])
            <class 'wr.Term'>
            >>> ts[0].term
            'example'
       +    >>> ts[0].term_category
       +    'Principal Translations'
            >>> ts[0].term_type
            'n'
            >>> ts[0].term_description
       @@ -93,6 +97,20 @@
            ['']
            >>> ts[0].translation_types
            ['nm']
       +    >>> ts[3].term
       +    'by way of example'
       +    >>> ts[3].term_category
       +    'Compound Forms'
       +    >>> ts[3].term_type
       +    'adv'
       +    >>> ts[3].term_description
       +    '(as an example)'
       +    >>> ts[3].translations
       +    ["a titolo d'esempio", "a mo' di esempio"]
       +    >>> ts[3].translation_contexts
       +    ['', '']
       +    >>> ts[3].translation_types
       +    ['avv', 'avv']
            """
            req = request.Request(WORDREFERENCE_URL.format(
                                      dictionary=parse.quote(dictionary),
       @@ -104,58 +122,70 @@
        
            ts = []
        
       -    if content and content.table:
       -        t = {}
       -        for tr in content.table.find_all('tr'):
       -            if 'wrtopsection' in tr.get('class', []) or 'langHeader' in tr.get('class', []):
       -                continue
       -
       -            if tr.get('id'):
       -                if t:
       -                    ts.append(Term(**t))
       -                t = {}
       -
       -            if tr.find('td', class_='FrWrd'):
       -                frwrd = tr.find('td', class_='FrWrd')
       -                t['term'] = frwrd.strong.text.strip()
       -                pos2 = frwrd.find('em', class_='tooltip POS2')
       -                if pos2 and pos2.children and len(list(pos2.children)) > 0:
       -                    t['term_type'] = list(pos2.children)[0].strip()
       -                else:
       -                    t['term_type'] = ''
       -                _, fr2, _ = tr.find_all('td')
       -                t['term_description'] = fr2.text.strip()
       +    if content:
       +        categ = ''
       +        for table in content.find_all('table', class_='WRD'):
       +            t = {}
       +            for tr in table.find_all('tr'):
       +                if 'langHeader' in tr.get('class', []):
       +                    continue
       +    
       +                if tr.get('id'):
       +                    if t:
       +                        t['term_category'] = categ
       +                        ts.append(Term(**t))
       +                        t = {}
        
       -            if tr.find('td', class_='ToWrd'):
       -                towrd = tr.find('td', class_='ToWrd')
       -                pos2 = towrd.find('em', class_='POS2')
       -                to2 = tr.find('td', class_='To2')
       -                if not t.get('translation_contexts'):
       -                    t['translation_contexts'] = []
       -                if not t.get('translation_types'):
       -                    t['translation_types'] = []
       -                if to2:
       -                    t['translation_contexts'].append(to2.text.strip())
       -                else:
       -                    t['translation_contexts'].append('')
       -                if pos2 and pos2.children and len(list(pos2.children)) > 0:
       -                    t['translation_types'].append(list(pos2.children)[0].strip())
       -                    towrd.find('em', class_='POS2').clear()
       -                else:
       -                    t['translation_types'].append('')
       -                if not t.get('translations'):
       -                    t['translations'] = []
       -                t['translations'].append(towrd.text.strip())
       -
       -            if tr.find('td', class_='FrEx'):
       -                frex = tr.find('td', class_='FrEx')
       -
       -            if tr.find('td', class_='ToEx'):
       -                toex = tr.find('td', class_='ToEx')
       -
       -        else:
       -            if t:
       -                ts.append(Term(**t))
       +                if 'wrtopsection' in tr.get('class', []):
       +                    if t:
       +                        t['term_category'] = categ
       +                        ts.append(Term(**t))
       +                        t = {}
       +                    categ = tr.td['title'].strip()
       +    
       +                if tr.find('td', class_='FrWrd'):
       +                    frwrd = tr.find('td', class_='FrWrd')
       +                    t['term'] = frwrd.strong.text.strip()
       +                    pos2 = frwrd.find('em', class_='tooltip POS2')
       +                    if pos2 and pos2.children and len(list(pos2.children)) > 0:
       +                        t['term_type'] = list(pos2.children)[0].strip()
       +                    else:
       +                        t['term_type'] = ''
       +                    _, fr2, _ = tr.find_all('td')
       +                    t['term_description'] = fr2.text.strip()
       +    
       +                if tr.find('td', class_='ToWrd'):
       +                    towrd = tr.find('td', class_='ToWrd')
       +                    pos2 = towrd.find('em', class_='POS2')
       +                    to2 = tr.find('td', class_='To2')
       +                    if not t.get('translation_contexts'):
       +                        t['translation_contexts'] = []
       +                    if not t.get('translation_types'):
       +                        t['translation_types'] = []
       +                    if to2:
       +                        t['translation_contexts'].append(to2.text.strip())
       +                    else:
       +                        t['translation_contexts'].append('')
       +                    if pos2 and pos2.children and len(list(pos2.children)) > 0:
       +                        t['translation_types'].append(list(pos2.children)[0].strip())
       +                        towrd.find('em', class_='POS2').clear()
       +                    else:
       +                        t['translation_types'].append('')
       +                    if not t.get('translations'):
       +                        t['translations'] = []
       +                    t['translations'].append(towrd.text.strip())
       +    
       +                if tr.find('td', class_='FrEx'):
       +                    frex = tr.find('td', class_='FrEx')
       +    
       +                if tr.find('td', class_='ToEx'):
       +                    toex = tr.find('td', class_='ToEx')
       +    
       +            else:
       +                if t:
       +                    t['term_category'] = categ
       +                    ts.append(Term(**t))
       +                    t = {}
        
            return ts