| Revisão | 0fa41a222be5999485ee73d7b3a1d59cf0a18269 (tree) |
|---|---|
| Hora | 2019-08-08 20:20:52 |
| Autor | Dan Villiom Podlaski Christiansen <dan@mage...> |
| Committer | Dan Villiom Podlaski Christiansen |
kill old food
@@ -1,177 +0,0 @@ | ||
1 | -#!/usr/bin/env python3.7 | |
2 | -# | |
3 | -# The employees of Magenta ApS wrote this file. | |
4 | -# | |
5 | -# As long as you retain this notice you can do whatever you want with | |
6 | -# this stuff. If we meet some day, and you think this stuff is worth | |
7 | -# it, you can buy us a beer in return. | |
8 | -# | |
9 | -# Dan Villiom Podlaski Christiansen (channeling Poul-Henning Kamp) | |
10 | -# | |
11 | - | |
12 | -import datetime | |
13 | -import json | |
14 | -import locale | |
15 | -import pathlib | |
16 | -import sys | |
17 | - | |
18 | -import click | |
19 | -import consolemd | |
20 | -import dateparser | |
21 | -import pdf2image | |
22 | -import pyquery | |
23 | -import pytesseract | |
24 | -import regex | |
25 | -import requests | |
26 | - | |
27 | -DEFAULT_URL = 'http://www.firma-catering.dk/frokostordning/ugens-menu' | |
28 | -ICON_URL = ( | |
29 | - 'https://git.magenta.dk/uploads/-/system/project/avatar/36/clipart.png' | |
30 | -) | |
31 | - | |
32 | -USER_AGENT = 'Mozilla/5.0 FoodBot/1337 (+https://git.magenta.dk/dan/food)' | |
33 | - | |
34 | - | |
35 | -def get_pdf_url(weeknum, url): | |
36 | - r = requests.get(url, headers={'User-Agent': USER_AGENT}) | |
37 | - r.raise_for_status() | |
38 | - | |
39 | - doc = pyquery.PyQuery(r.content) | |
40 | - doc.make_links_absolute(url) | |
41 | - | |
42 | - for el in doc.find('a.ugemenu'): | |
43 | - text = ''.join(el.itertext()) | |
44 | - | |
45 | - if text.lower() == f'uge {weeknum}': | |
46 | - return el.get('href') | |
47 | - | |
48 | - return None | |
49 | - | |
50 | - | |
51 | -def get_menu(dt, url): | |
52 | - locale.setlocale(locale.LC_ALL, 'da_DK.UTF-8') | |
53 | - | |
54 | - weeknum = str(int(dt.strftime('%V'))) | |
55 | - weekday = dt.strftime('%A') | |
56 | - month = dt.strftime('%b').rstrip('.') | |
57 | - | |
58 | - topdir = pathlib.Path(__file__).parent | |
59 | - cachedir = topdir / 'cache' | |
60 | - pdfcache = cachedir / f'week_{weeknum}.pdf' | |
61 | - txtcache = cachedir / f'week_{weeknum}.txt' | |
62 | - | |
63 | - spellbugs = json.loads(topdir.joinpath('spellbugs.json').read_text()) | |
64 | - | |
65 | - pretty_date = ( | |
66 | - dt.strftime('%c').replace(dt.strftime('%X '), '').replace(' ', ' ') | |
67 | - ) | |
68 | - | |
69 | - if not pdfcache.exists(): | |
70 | - pdf_url = get_pdf_url(weeknum, url) | |
71 | - | |
72 | - if not pdf_url: | |
73 | - return | |
74 | - | |
75 | - with requests.get( | |
76 | - pdf_url, stream=True, headers={'User-Agent': USER_AGENT} | |
77 | - ) as r: | |
78 | - r.raise_for_status() | |
79 | - | |
80 | - pdfcache.parent.mkdir(parents=True, exist_ok=True) | |
81 | - pdfcache.write_bytes(r.content) | |
82 | - | |
83 | - if not txtcache.exists(): | |
84 | - images = pdf2image.convert_from_path(pdfcache, dpi=450) | |
85 | - | |
86 | - txtcache.write_text( | |
87 | - '\n'.join( | |
88 | - pytesseract.image_to_string( | |
89 | - image, lang='dan', config='--psm 6' | |
90 | - ) | |
91 | - for image in images | |
92 | - ) | |
93 | - ) | |
94 | - | |
95 | - first = True | |
96 | - | |
97 | - for line in txtcache.read_text().splitlines(): | |
98 | - if regex.match(f'^{weekday}', line, flags=regex.IGNORECASE): | |
99 | - if first: | |
100 | - yield f'[Frokostmenuen]({url}) for {pretty_date} er:' | |
101 | - yield '' | |
102 | - | |
103 | - first = False | |
104 | - | |
105 | - # gah | |
106 | - for needle, better_needle in spellbugs.items(): | |
107 | - line = line.replace(needle, better_needle) | |
108 | - | |
109 | - line = regex.sub(r':(?=[^ ])', ': ', line) | |
110 | - line = regex.sub( | |
111 | - f'^{weekday} ', '', line, count=1, flags=regex.IGNORECASE | |
112 | - ) | |
113 | - line = regex.sub( | |
114 | - r'^[\d/]+ ?', '', line, count=1, flags=regex.IGNORECASE | |
115 | - ) | |
116 | - line = regex.sub( | |
117 | - f'^{month} ', '', line, count=1, flags=regex.IGNORECASE | |
118 | - ) | |
119 | - line = regex.sub(f'^— ', '', line, count=1) | |
120 | - | |
121 | - line = regex.sub( | |
122 | - r'^(\p{General_Category=Uppercase_Letter}\w+) ' | |
123 | - r'([\d\p{General_Category=Uppercase_Letter}])', | |
124 | - r'\1: \2', | |
125 | - line, | |
126 | - count=1, | |
127 | - ) | |
128 | - | |
129 | - line = regex.sub(r'^([^:]+:)', r'*\1*', line, count=1) | |
130 | - | |
131 | - yield line | |
132 | - | |
133 | - | |
134 | -@click.command() | |
135 | -@click.pass_context | |
136 | -@click.argument('url', default=DEFAULT_URL) | |
137 | -@click.option('-c', '--channel', default='test') | |
138 | -@click.option('-u', '--json-url') | |
139 | -@click.option('-d', '--date', metavar='DATE') | |
140 | -def main(ctxt, date, url, json_url, channel): | |
141 | - if date is not None: | |
142 | - dt = dateparser.parse( | |
143 | - date, settings={'DATE_ORDER': 'DMY', 'PREFER_DATES_FROM': 'future'} | |
144 | - ) | |
145 | - else: | |
146 | - dt = datetime.datetime.now() | |
147 | - | |
148 | - if dt is None: | |
149 | - click.echo(f'error: not a valid date: {date}', err=True) | |
150 | - raise click.exceptions.Exit(1) | |
151 | - | |
152 | - menu = '\n'.join(get_menu(dt, url)) | |
153 | - | |
154 | - if not menu: | |
155 | - click.echo(f'error: no menu for {dt.date()}!', err=True) | |
156 | - raise click.exceptions.Exit(1) | |
157 | - elif not json_url: | |
158 | - if sys.stdout.isatty(): | |
159 | - menu = consolemd.Renderer().render(menu) | |
160 | - | |
161 | - click.echo(menu) | |
162 | - else: | |
163 | - r = requests.post( | |
164 | - json_url, | |
165 | - json={ | |
166 | - 'text': menu, | |
167 | - 'channel': f'#{channel}', | |
168 | - 'from': 'Madbåtten', | |
169 | - 'icon_url': ICON_URL, | |
170 | - }, | |
171 | - ) | |
172 | - | |
173 | - r.raise_for_status() | |
174 | - | |
175 | - | |
176 | -if __name__ == '__main__': | |
177 | - main() |