wr.py - wr - Translate a term via WordReference.com
(HTM) hg clone https://bitbucket.org/iamleot/wr
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
---
wr.py
---
1 #!/usr/pkg/bin/python3.7
2
3 #
4 # Copyright (c) 2019 Leonardo Taccari, Rocky Hotas
5 # All rights reserved.
6 #
7 # Redistribution and use in source and binary forms, with or without
8 # modification, are permitted provided that the following conditions
9 # are met:
10 #
11 # 1. Redistributions of source code must retain the above copyright
12 # notice, this list of conditions and the following disclaimer.
13 # 2. Redistributions in binary form must reproduce the above copyright
14 # notice, this list of conditions and the following disclaimer in the
15 # documentation and/or other materials provided with the distribution.
16 #
17 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
19 # TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
20 # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS
21 # BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 # POSSIBILITY OF SUCH DAMAGE.
28 #
29
30
31 """
32 Translate a term via WordReference.com
33
34 wr is a script/module to translate terms via WordReference.com.
35 """
36
37
38 from bs4 import BeautifulSoup, SoupStrainer
39 from typing import List
40 from urllib import parse, request
41 import collections
42 import textwrap
43
44
45 WORDREFERENCE_URL = 'https://www.wordreference.com/{dictionary}/{term}'
46 WORDREFERENCE_USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0'
47
48
# One dictionary entry scraped from a WordReference result page: the source
# term plus the parallel lists describing each of its translations.
Term = collections.namedtuple('Term', [
    'term',
    'term_type',
    'term_description',
    'term_category',
    'translations',
    'translation_contexts',
    'translation_types',
])
# Assign (rather than append to) the field docstrings: the auto-generated
# default is "Alias for field number N", and `+=' would glue the description
# directly onto it without any separator.
Term.term.__doc__ = 'Term'
Term.term_category.__doc__ = 'Category of translation'
Term.term_description.__doc__ = 'Description of the term'
Term.term_type.__doc__ = 'Type of the term'
Term.translations.__doc__ = 'List of all translations'
Term.translation_contexts.__doc__ = 'List of contexts of all the translations'
Term.translation_types.__doc__ = 'List of types of all the translations'
65
66
def _leading_text(tag) -> str:
    """Return the stripped first text child of `tag', or '' if there is none.

    `tag' may be None (element not found).  A first child that is itself a
    tag rather than a text node also yields ''.
    """
    if tag is None:
        return ''
    first = next(iter(tag.children), None)
    # bs4 text nodes are str subclasses; anything else has no usable strip().
    return first.strip() if isinstance(first, str) else ''


def _drop_arrow_links(cell) -> None:
    """Remove every `<a>' whose text is the arrow character from `cell'."""
    for link in cell.find_all('a', string='⇒'):
        link.decompose()


def _append_term(results: list, fields: dict, category: str) -> None:
    """Append the pending `fields' to `results' as a Term, if there are any.

    Fields that were never collected for the entry (e.g. a term row that was
    not followed by any translation row) default to '' or an empty list so
    that building the Term cannot fail on a missing argument.
    """
    if not fields:
        return
    complete = {
        'term': '',
        'term_type': '',
        'term_description': '',
        'translations': [],
        'translation_contexts': [],
        'translation_types': [],
    }
    complete.update(fields)
    complete['term_category'] = category
    results.append(Term(**complete))


def translate(dictionary: str, term: str) -> List[Term]:
    """Translate a term scraping WordReference.com

    Given a pair of dictionaries (e.g. `enit') via `dictionary',
    translate the term `term' and return all translations as a list of Term-s.

    Supported dictionaries are: `ar' (Arabic), `cz' (Czech), `de' (German),
    `en' (English), `es' (Spanish), `fr' (French), `gr' (Greek),
    `it' (Italian), `ja' (Japanese), `ko' (Korean), `nl' (Dutch),
    `pl' (Polish), `pt' (Portuguese), `ro' (Romanian), `ru' (Russian),
    `sv' (Swedish), `tr' (Turkish), `zh' (Chinese).

    The page is fetched with urllib.request.urlopen(); any error it raises
    is propagated to the caller.  An empty list is returned when the page
    contains no translation tables.

    The examples below query the live site, so the exact values depend on
    the current content of WordReference.com.

    >>> ts = translate('enit', 'example')
    >>> type(ts)
    <class 'list'>
    >>> len(ts)
    17
    >>> type(ts[0])
    <class 'wr.Term'>
    >>> ts[0].term
    'example'
    >>> ts[0].term_category
    'Principal Translations'
    >>> ts[0].term_type
    'n'
    >>> ts[0].term_description
    '(typical instance)'
    >>> ts[0].translations
    ['esempio']
    >>> ts[0].translation_contexts
    ['']
    >>> ts[0].translation_types
    ['nm']
    >>> ts[3].term
    'by way of example'
    >>> ts[3].term_category
    'Compound Forms'
    >>> ts[3].term_type
    'adv'
    >>> ts[3].term_description
    '(as an example)'
    >>> ts[3].translations
    ["a titolo d'esempio", "a mo' di esempio"]
    >>> ts[3].translation_contexts
    ['', '']
    >>> ts[3].translation_types
    ['avv', 'avv']
    """
    req = request.Request(WORDREFERENCE_URL.format(
        dictionary=parse.quote(dictionary),
        term=parse.quote(term)))
    req.add_header('User-Agent', WORDREFERENCE_USER_AGENT)
    with request.urlopen(req) as r:
        # Only the element with id `articleWRD' is parsed.
        content = BeautifulSoup(r, 'html.parser',
                                parse_only=SoupStrainer(id='articleWRD'))

    ts = []
    categ = ''
    for table in content.find_all('table', class_='WRD'):
        t = {}  # fields of the term currently being assembled
        for tr in table.find_all('tr'):
            row_classes = tr.get('class', [])
            if 'langHeader' in row_classes:
                continue

            # A row with an id, or a section header row, closes the term
            # collected so far.  The pending term is stored with the category
            # that was current *before* a section header updates it.
            is_section = 'wrtopsection' in row_classes
            if tr.get('id') or is_section:
                _append_term(ts, t, categ)
                t = {}
            if is_section:
                categ = tr.td['title'].strip()

            frwrd = tr.find('td', class_='FrWrd')
            if frwrd is not None:
                _drop_arrow_links(frwrd)
                t['term'] = frwrd.strong.text.strip()
                t['term_type'] = _leading_text(
                    frwrd.find('em', class_='tooltip POS2'))
                # The description is the middle cell of a three-cell row;
                # any other layout yields an empty description.
                cells = tr.find_all('td')
                if len(cells) == 3:
                    t['term_description'] = cells[1].text.strip()
                else:
                    t['term_description'] = ''

            towrd = tr.find('td', class_='ToWrd')
            if towrd is not None:
                _drop_arrow_links(towrd)
                pos2 = towrd.find('em', class_='POS2')
                to2 = tr.find('td', class_='To2')
                t.setdefault('translation_contexts', []).append(
                    to2.text.strip() if to2 is not None else '')
                t.setdefault('translation_types', []).append(
                    _leading_text(pos2))
                if pos2 is not None:
                    # Empty the type marker so that it does not end up in
                    # the translation text extracted below.
                    pos2.clear()
                t.setdefault('translations', []).append(towrd.text.strip())

        # End of table: store the last pending term, if any.
        _append_term(ts, t, categ)

    return ts
193
194
def print_term(term: Term, ansi_escape: bool = False):
    """Pretty print a Term

    Print the term followed by its type on the first line, its description
    on the second one and then every translation (preceded by its context,
    when it has one) indented and wrapped at 80 columns.  The term is
    rendered in bold and the types are underlined if `ansi_escape' is True.
    """
    heading = bold(term.term, ansi_escape)
    heading_type = underline(term.term_type, ansi_escape)
    print(f'{heading} {heading_type}')
    print(f'{term.term_description}')

    # The same wrapping settings apply to every translation line.
    wrapper = textwrap.TextWrapper(
        width=80,
        initial_indent=' ' * 8,
        subsequent_indent=' ' * 12,
        break_long_words=False,
        break_on_hyphens=False,
    )

    entries = zip(term.translations,
                  term.translation_contexts,
                  term.translation_types)
    for text, context, kind in entries:
        if context:
            print(f'{context}')
        line = f'{text} {underline(kind, ansi_escape)}'
        print(wrapper.fill(line))
211
212
def underline(text: str, ansi_escape: bool = False) -> str:
    """Underline text via ANSI escape characters

    Return `text' wrapped in the ANSI underline (`ESC[4m') and reset
    (`ESC[0m') sequences if `ansi_escape' is True, otherwise return `text'
    unmodified.
    """
    if not ansi_escape:
        return text
    return f'\033[4m{text}\033[0m'
223
224
def bold(text: str, ansi_escape: bool = False) -> str:
    """Make text bold via ANSI escape characters

    Return `text' wrapped in the ANSI bold (`ESC[1m') and reset (`ESC[0m')
    sequences if `ansi_escape' is True, otherwise return `text' unmodified.
    """
    return f'\033[1m{text}\033[0m' if ansi_escape else text
235
236
if __name__ == '__main__':
    # Command line interface: translate a term and pretty print the result
    # grouped by translation category.
    import argparse

    parser = argparse.ArgumentParser(description="WordReference cli interface")
    parser.add_argument("dictionary", type=str, help="specify the dictionaries to be used: e.g., enit")
    parser.add_argument("term", type=str, help="specify the term to be translated")
    parser.add_argument("-a", action="store_true", help="show all the translation categories", default=False)
    parser.add_argument("-e", action="store_true", help="format text via ANSI escape characters", default=False)
    args = parser.parse_args()

    # Categories shown by default; every other category requires `-a'.
    mcats = ('Principal Translations', 'Additional Translations')

    pcat = ''        # category of the previously printed term
    printed = False  # whether any term has been printed so far
    for t in translate(args.dictionary, args.term):
        if not args.a and t.term_category not in mcats:
            continue
        # Separators depend on what has actually been printed, not on the
        # position in the unfiltered list, so that the output never starts
        # with blank lines when the leading entries are filtered out.
        if printed:
            print()
        if t.term_category != pcat:
            pcat = t.term_category
            if printed:
                print()
            print(bold(pcat, args.e))
            print()
        print_term(t, args.e)
        printed = True