Keep crawling on transient connection failure, retry failed URLs once
Simon Volpert
5 years ago
 56 |  56 |   def save_cache():
 57 |  57 |       if args.dry_run:
 58 |  58 |           return
 59 |     | -     for _file in [urls.done, urls.new, users.done, users.new, blacklist]:
    |  59 | +     for _file in [urls.done, urls.new, urls.failed, users.done, users.new, blacklist]:
 60 |  60 |           try:
 61 |  61 |               _file.path.write_text('\n'.join(_file) + '\n')
 62 |  62 |           except (IOError, OSError) as exc:
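The cache files written above are plain newline-separated URL lists, and the new urls.failed cache is persisted in the same way as the others. A minimal round-trip sketch of that format (the file path here is hypothetical; fediscover presumably resolves the real cache path via load_file()):

```python
from pathlib import Path

# Hypothetical location; the real cache path comes from load_file() elsewhere.
failed_path = Path('/tmp/urls.failed')

# Write: one URL per line with a trailing newline, as in save_cache() above.
failed_urls = ['https://example.social/@alice', 'https://example.net/@bob']
failed_path.write_text('\n'.join(failed_urls) + '\n')

# Read back: splitlines() drops the trailing newline and restores the list.
assert failed_path.read_text().splitlines() == failed_urls
```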
142 | 142 |           verbose('Skipping {}'.format(url))
143 | 143 |           return
144 | 144 |       # TODO: Add special clause for "done" URL storage
145 |     | -     if url in urls.done:
146 |     | -         urls.done.remove(url)
147 |     | -     urls.done.append(url)
148 | 145 |       # If the URL is a profile URL, mark it as seen
149 | 146 |       if profile_regex.fullmatch(url):
150 | 147 |           if url in users.done:
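The three removed lines implement a move-to-end pattern for urls.done (drop any earlier occurrence, then append), and this commit relocates that pattern so it runs only after scrape_page() succeeds; the same lines reappear in the next hunk. A standalone sketch of the pattern:

```python
def mark_done(done, url):
    # Keep the list duplicate-free and ordered by most recent completion:
    # remove any earlier occurrence, then append the URL at the end.
    if url in done:
        done.remove(url)
    done.append(url)

done = ['a', 'b', 'c']
mark_done(done, 'b')
print(done)  # ['a', 'c', 'b']
```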
156 | 153 |       stderr('Loading {}'.format(url))
157 | 154 |       try:
158 | 155 |           page = load_url(url)
159 |     | -     except OSError:
160 |     | -         # TODO: Move into a "failed" queue instead and try again once on next run
161 |     | -         stderr('Connection timed out')
162 |     | -         stderr('If this happens repeatedly, try skipping this URL with')
163 |     | -         stderr(' fediscover crawl --skip')
164 |     | -         raise SystemExit
    | 156 | +     except OSError as exc:
    | 157 | +         stderr('Error: {}'.format(exc))
    | 158 | +         if url in urls.failed:
    | 159 | +             stderr('Skipping {}'.format(url))
    | 160 | +             urls.done.append(url)
    | 161 | +         else:
    | 162 | +             urls.failed.append(url)
    | 163 | +             urls.new.append(url)
    | 164 | +         continue
    | 165 | +     # Process page and mark URL as complete
165 | 166 |       scrape_page(page, url)
    | 167 | +     if url in urls.done:
    | 168 | +         urls.done.remove(url)
    | 169 | +     urls.done.append(url)
166 | 170 |
167 | 171 |
168 | 172 |   # Open the working cache files
169 | 173 |   class urls(object):
170 | 174 |       done = load_file('urls.done')
171 | 175 |       new = load_file('urls.new')
    | 176 | +     failed = load_file('urls.failed')
172 | 177 |       added = 0
173 | 178 |       skipped = 0
174 | 179 |
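Taken together, the new except branch gives every URL at most one retry: the first failure records the URL in urls.failed and puts it back on the urls.new queue, while a second failure marks it done and the crawl moves on. A minimal sketch of that policy, using plain lists in place of the cache objects returned by load_file():

```python
def handle_failure(url, failed, new, done):
    """One-retry policy: the first failure re-queues the URL, the second gives up."""
    if url in failed:
        # Second failure: stop retrying and treat the URL as processed.
        done.append(url)
    else:
        # First failure: remember it and put it back on the crawl queue.
        failed.append(url)
        new.append(url)

failed, new, done = [], [], []
handle_failure('https://example.social/@carol', failed, new, done)  # first failure
handle_failure('https://example.social/@carol', failed, new, done)  # second failure
print(done)  # ['https://example.social/@carol'] -- skipped after the retry also failed
```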