Simon Volpert / fediscover / 9b8389a

Keep crawling on transient connection failure, retry failed URLs once
Simon Volpert, 5 years ago

1 changed file with 15 additions and 10 deletions.
@@ -56,7 +56,7 @@
 def save_cache():
     if args.dry_run:
         return
-    for _file in [urls.done, urls.new, users.done, users.new, blacklist]:
+    for _file in [urls.done, urls.new, urls.failed, users.done, users.new, blacklist]:
         try:
             _file.path.write_text('\n'.join(_file) + '\n')
         except (IOError, OSError) as exc:
@@ -142,9 +142,6 @@
             verbose('Skipping {}'.format(url))
             return
         # TODO: Add special clause for "done" URL storage
-        if url in urls.done:
-            urls.done.remove(url)
-            urls.done.append(url)
         # If the URL is a profile URL, mark it as seen
         if profile_regex.fullmatch(url):
             if url in users.done:
@@ -156,19 +153,27 @@
         stderr('Loading {}'.format(url))
         try:
             page = load_url(url)
-        except OSError:
-            # TODO: Move into a "failed" queue instead and try again once on next run
-            stderr('Connection timed out')
-            stderr('If this happens repeatedly, try skipping this URL with')
-            stderr(' fediscover crawl --skip')
-            raise SystemExit
+        except OSError as exc:
+            stderr('Error: {}'.format(exc))
+            if url in urls.failed:
+                stderr('Skipping {}'.format(url))
+                urls.done.append(url)
+            else:
+                urls.failed.append(url)
+                urls.new.append(url)
+            continue
+        # Process page and mark URL as complete
         scrape_page(page, url)
+        if url in urls.done:
+            urls.done.remove(url)
+            urls.done.append(url)
 
 
 # Open the working cache files
 class urls(object):
     done = load_file('urls.done')
     new = load_file('urls.new')
+    failed = load_file('urls.failed')
     added = 0
     skipped = 0
 
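For context, the retry-once behaviour introduced here boils down to a small pattern: a URL that fails with an OSError is recorded in a failed list and pushed back onto the end of the queue; if the same URL fails a second time it is skipped and treated as done, so one unreachable host no longer aborts the whole crawl. Below is a minimal, self-contained sketch of that pattern, with illustrative names (crawl, queue, done, failed) and a plain urllib fetch standing in for fediscover's own load_url() and cache handling.

import urllib.request


def crawl(queue, done, failed):
    """Drain 'queue', giving each URL that fails with an OSError one retry."""
    while queue:
        url = queue.pop(0)
        try:
            # urllib.error.URLError is a subclass of OSError, so network
            # failures and timeouts are both caught below
            with urllib.request.urlopen(url, timeout=10) as response:
                page = response.read()
        except OSError as exc:
            print('Error: {}'.format(exc))
            if url in failed:
                # Second failure: give up on this URL and treat it as done
                print('Skipping {}'.format(url))
                done.append(url)
            else:
                # First failure: remember it and re-queue it for one retry
                failed.append(url)
                queue.append(url)
            continue
        # Success: this is where the page would be scraped for new links
        print('Fetched {} bytes from {}'.format(len(page), url))
        done.append(url)


if __name__ == '__main__':
    new, done, failed = ['https://example.com/'], [], []
    crawl(new, done, failed)
    print('done:', done, 'failed:', failed)

Because urls.failed is also persisted by save_cache() alongside urls.done and urls.new, the single retry is not limited to the current run: a URL that failed once can be retried later in the same run or on a subsequent invocation, and only after a second failure is it written off.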