Rewrite the parsing in translate() and adjust Term and print_term() accordingly - wr - Translate a term via WordReference.com
(HTM) hg clone https://bitbucket.org/iamleot/wr
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
---
(DIR) changeset b7eb697e8b8c4c3096ab4c0edd6cadac2a682859
(DIR) parent a9ddda00f212ad36066e7963bf29e84b8bc661f1
(HTM) Author: Leonardo Taccari <iamleot@gmail.com>
Date: Fri, 18 Oct 2019 12:36:05
Rewrite the parsing in translate() and adjust Term and print_term() accordingly
Adjust the parsing to catch all rows instead of only the ones with an `id'
attribute. Add initial logic to parse examples.
Multiple rows are still not properly honored (e.g. there could be multiple
translations that span several rows).
Diffstat:
wr.py | 87 +++++++++++++++++++++++++++++++++++++++++++-----------------------
1 files changed, 56 insertions(+), 31 deletions(-)
---
diff -r a9ddda00f212 -r b7eb697e8b8c wr.py
--- a/wr.py Sat Oct 05 13:06:43 2019 +0200
+++ b/wr.py Fri Oct 18 12:36:05 2019 +0200
@@ -48,14 +48,16 @@
Term = collections.namedtuple('Term', [
'term',
- 'type',
+ 'term_type',
'term_description',
'translation',
+ 'translation_type',
])
Term.term.__doc__ += 'Term'
Term.term_description.__doc__ += 'Description of the term'
-Term.type.__doc__ += 'Type'
+Term.term_type.__doc__ += 'Type of the term'
Term.translation.__doc__ += 'Translation'
+Term.translation_type.__doc__ += 'Type of the translation'
def translate(dictionary: str, term: str) -> List[Term]:
@@ -79,12 +81,14 @@
<class 'wr.Term'>
>>> ts[0].term
'example'
- >>> ts[0].type
+ >>> ts[0].term_type
'n'
>>> ts[0].term_description
'(typical instance)'
>>> ts[0].translation
'esempio'
+ >>> ts[0].translation_type
+ 'nm'
"""
req = request.Request(WORDREFERENCE_URL.format(
dictionary=parse.quote(dictionary),
@@ -97,44 +101,65 @@
ts = []
if content and content.table:
- for tr in content.table.find_all('tr', id=True):
- frwrd, fr2, towrd = tr.find_all('td')
- pos2 = frwrd.find('em', class_='tooltip POS2')
- term = frwrd.strong.text.strip()
- if fr2.find('span', class_='dsense'):
- fr2.span.clear()
- term_description = fr2.text.strip()
- if pos2 and pos2.children and len(list(pos2.children)) > 0:
- type = list(pos2.children)[0].strip()
- else:
- type = ''
- if towrd.find('em', class_='POS2'):
- towrd.em.clear()
- translation = towrd.text.strip()
- ts.append(
- Term(
- term=term,
- type=type,
- term_description=term_description,
- translation=translation,
- )
- )
+ t = {}
+ for tr in content.table.find_all('tr'):
+ if 'wrtopsection' in tr.get('class', []) or 'langHeader' in tr.get('class', []):
+ continue
+
+ if tr.get('id'):
+ if t:
+ ts.append(Term(**t))
+ t = {}
+
+ if tr.find('td', class_='FrWrd'):
+ frwrd = tr.find('td', class_='FrWrd')
+ t['term'] = frwrd.strong.text.strip()
+ pos2 = frwrd.find('em', class_='tooltip POS2')
+ if pos2 and pos2.children and len(list(pos2.children)) > 0:
+ t['term_type'] = list(pos2.children)[0].strip()
+ else:
+ t['term_type'] = ''
+ _, fr2, _ = tr.find_all('td')
+ t['term_description'] = fr2.text.strip()
+
+ if tr.find('td', class_='ToWrd'):
+ towrd = tr.find('td', class_='ToWrd')
+ pos2 = towrd.find('em', class_='POS2')
+ if pos2 and pos2.children and len(list(pos2.children)) > 0:
+ t['translation_type'] = list(pos2.children)[0].strip()
+ towrd.find('em', class_='POS2').clear()
+ else:
+ t['translation_type'] = ''
+ t['translation'] = towrd.text.strip()
+
+ if tr.find('td', class_='FrEx'):
+ frex = tr.find('td', class_='FrEx')
+
+ if tr.find('td', class_='ToEx'):
+ toex = tr.find('td', class_='ToEx')
+
+ else:
+ if t:
+ ts.append(Term(**t))
return ts
def print_term(term: Term):
"""Pretty print a Term"""
- if term.type:
- sfmt = '{term} [{type}] {description}:\n{translation}'
- else:
- sfmt = '{term} {description}:\n{translation}'
+ sfmt = '{term} '
+ if term.term_type:
+ sfmt += '[{term_type}] '
+ sfmt += '{description}:\n{translation} '
+ if term.translation_type:
+ sfmt += '[{translation_type}]'
print(textwrap.fill(sfmt.format(
term=term.term,
- type=term.type,
+ term_type=term.term_type,
description=term.term_description,
- translation=term.translation),
+ translation=term.translation,
+ translation_type=term.translation_type).strip(),
width=80,
break_long_words=False,
break_on_hyphens=False))