Parse all tables and add a term_category to Term - wr - Translate a term via WordReference.com
(HTM) hg clone https://bitbucket.org/iamleot/wr
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
---
(DIR) changeset 395bf3c1b7d4906b8b127e201dde50a8a65aabe0
(DIR) parent c5b5d59472259bd4123d5641cdbb11677f4a9d98
(HTM) Author: Leonardo Taccari <iamleot@gmail.com>
Date: Thu, 21 Nov 2019 16:43:53
Parse all tables and add a term_category to Term
Honor all translations and categories. Previously only the first table was
parsed (`Principal Translations' and `Additional Translations').
Populate the new term_category field in Term with the corresponding category.
Update copyright.
Patch from Rocky Hotas, thanks!
Diffstat:
wr.py | 136 ++++++++++++++++++++++++++++++++++++++++-------------------------
1 files changed, 83 insertions(+), 53 deletions(-)
---
diff -r c5b5d5947225 -r 395bf3c1b7d4 wr.py
--- a/wr.py Wed Oct 30 09:31:06 2019 +0000
+++ b/wr.py Thu Nov 21 16:43:53 2019 +0100
@@ -1,7 +1,7 @@
#!/usr/pkg/bin/python3.7
#
-# Copyright (c) 2019 Leonardo Taccari
+# Copyright (c) 2019 Leonardo Taccari, Rocky Hotas
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
@@ -50,11 +50,13 @@
'term',
'term_type',
'term_description',
+ 'term_category',
'translations',
'translation_contexts',
'translation_types',
])
Term.term.__doc__ += 'Term'
+Term.term_category.__doc__ += 'Category of translation'
Term.term_description.__doc__ += 'Description of the term'
Term.term_type.__doc__ += 'Type of the term'
Term.translations.__doc__ += 'List of all translations'
@@ -78,11 +80,13 @@
>>> type(ts)
<class 'list'>
>>> len(ts)
- 3
+ 17
>>> type(ts[0])
<class 'wr.Term'>
>>> ts[0].term
'example'
+ >>> ts[0].term_category
+ 'Principal Translations'
>>> ts[0].term_type
'n'
>>> ts[0].term_description
@@ -93,6 +97,20 @@
['']
>>> ts[0].translation_types
['nm']
+ >>> ts[3].term
+ 'by way of example'
+ >>> ts[3].term_category
+ 'Compound Forms'
+ >>> ts[3].term_type
+ 'adv'
+ >>> ts[3].term_description
+ '(as an example)'
+ >>> ts[3].translations
+ ["a titolo d'esempio", "a mo' di esempio"]
+ >>> ts[3].translation_contexts
+ ['', '']
+ >>> ts[3].translation_types
+ ['avv', 'avv']
"""
req = request.Request(WORDREFERENCE_URL.format(
dictionary=parse.quote(dictionary),
@@ -104,58 +122,70 @@
ts = []
- if content and content.table:
- t = {}
- for tr in content.table.find_all('tr'):
- if 'wrtopsection' in tr.get('class', []) or 'langHeader' in tr.get('class', []):
- continue
-
- if tr.get('id'):
- if t:
- ts.append(Term(**t))
- t = {}
-
- if tr.find('td', class_='FrWrd'):
- frwrd = tr.find('td', class_='FrWrd')
- t['term'] = frwrd.strong.text.strip()
- pos2 = frwrd.find('em', class_='tooltip POS2')
- if pos2 and pos2.children and len(list(pos2.children)) > 0:
- t['term_type'] = list(pos2.children)[0].strip()
- else:
- t['term_type'] = ''
- _, fr2, _ = tr.find_all('td')
- t['term_description'] = fr2.text.strip()
+ if content:
+ categ = ''
+ for table in content.find_all('table', class_='WRD'):
+ t = {}
+ for tr in table.find_all('tr'):
+ if 'langHeader' in tr.get('class', []):
+ continue
+
+ if tr.get('id'):
+ if t:
+ t['term_category'] = categ
+ ts.append(Term(**t))
+ t = {}
- if tr.find('td', class_='ToWrd'):
- towrd = tr.find('td', class_='ToWrd')
- pos2 = towrd.find('em', class_='POS2')
- to2 = tr.find('td', class_='To2')
- if not t.get('translation_contexts'):
- t['translation_contexts'] = []
- if not t.get('translation_types'):
- t['translation_types'] = []
- if to2:
- t['translation_contexts'].append(to2.text.strip())
- else:
- t['translation_contexts'].append('')
- if pos2 and pos2.children and len(list(pos2.children)) > 0:
- t['translation_types'].append(list(pos2.children)[0].strip())
- towrd.find('em', class_='POS2').clear()
- else:
- t['translation_types'].append('')
- if not t.get('translations'):
- t['translations'] = []
- t['translations'].append(towrd.text.strip())
-
- if tr.find('td', class_='FrEx'):
- frex = tr.find('td', class_='FrEx')
-
- if tr.find('td', class_='ToEx'):
- toex = tr.find('td', class_='ToEx')
-
- else:
- if t:
- ts.append(Term(**t))
+ if 'wrtopsection' in tr.get('class', []):
+ if t:
+ t['term_category'] = categ
+ ts.append(Term(**t))
+ t = {}
+ categ = tr.td['title'].strip()
+
+ if tr.find('td', class_='FrWrd'):
+ frwrd = tr.find('td', class_='FrWrd')
+ t['term'] = frwrd.strong.text.strip()
+ pos2 = frwrd.find('em', class_='tooltip POS2')
+ if pos2 and pos2.children and len(list(pos2.children)) > 0:
+ t['term_type'] = list(pos2.children)[0].strip()
+ else:
+ t['term_type'] = ''
+ _, fr2, _ = tr.find_all('td')
+ t['term_description'] = fr2.text.strip()
+
+ if tr.find('td', class_='ToWrd'):
+ towrd = tr.find('td', class_='ToWrd')
+ pos2 = towrd.find('em', class_='POS2')
+ to2 = tr.find('td', class_='To2')
+ if not t.get('translation_contexts'):
+ t['translation_contexts'] = []
+ if not t.get('translation_types'):
+ t['translation_types'] = []
+ if to2:
+ t['translation_contexts'].append(to2.text.strip())
+ else:
+ t['translation_contexts'].append('')
+ if pos2 and pos2.children and len(list(pos2.children)) > 0:
+ t['translation_types'].append(list(pos2.children)[0].strip())
+ towrd.find('em', class_='POS2').clear()
+ else:
+ t['translation_types'].append('')
+ if not t.get('translations'):
+ t['translations'] = []
+ t['translations'].append(towrd.text.strip())
+
+ if tr.find('td', class_='FrEx'):
+ frex = tr.find('td', class_='FrEx')
+
+ if tr.find('td', class_='ToEx'):
+ toex = tr.find('td', class_='ToEx')
+
+ else:
+ if t:
+ t['term_category'] = categ
+ ts.append(Term(**t))
+ t = {}
return ts