Selenium logs, on-delete cascade, task timeouts, parser URL host requirement
@@ -12,6 +12,7 @@ class MissingKidsFetcher():
         pass
 
     def get_missing_kids_urls(self, first_n_pages=-1):
+        logger.info("Get MissingKids, #pages: {}".format(first_n_pages))
         # Poster URL
         url = "https://www.missingkids.org/gethelpnow/search/poster-search-results"
         # URLs
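
The added logger.info call records how many result pages each fetch was asked for, matching the commit's Selenium-logging theme. A minimal sketch of what the Selenium-driven page fetch behind get_missing_kids_urls could look like, assuming headless Chrome and the SELENIUM_SLEEP_PER_PAGE variable set in the compose files below (fetch_page_source is a hypothetical helper, not this repository's code):

import os
import time

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

def fetch_page_source(url):
    # Hypothetical helper: headless Chrome, wait for the JS-rendered
    # results, then return the page HTML. The sleep mirrors the compose
    # default of 4 seconds per page.
    options = Options()
    options.add_argument("--headless")
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        time.sleep(int(os.environ.get("SELENIUM_SLEEP_PER_PAGE", "4")))
        return driver.page_source
    finally:
        driver.quit()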
@@ -43,7 +43,7 @@ class StatusPatternMatching(models.Model):
         return "{} -> {} [Priority: {}]".format(self.pattern, self.status, self.priority)
 
 class UrlContent(models.Model):
-    id_url = models.OneToOneField('Urls', models.DO_NOTHING, db_column='id_url', primary_key=True)
+    id_url = models.OneToOneField('Urls', models.CASCADE, db_column='id_url', primary_key=True)
     date_published = models.DateTimeField(blank=True, null=True)
     title = models.TextField(blank=True, null=True)
     description = models.TextField(blank=True, null=True)
@@ -86,8 +86,8 @@ class Urls(models.Model):
 
 
 class UrlsDuplicate(models.Model):
-    id_url_canonical = models.OneToOneField(Urls, models.DO_NOTHING, db_column='id_url_canonical', primary_key=True) # The composite primary key (id_url_canonical, id_url_duplicated) found, that is not supported. The first column is selected.
-    id_url_duplicated = models.ForeignKey(Urls, models.DO_NOTHING, db_column='id_url_duplicated', related_name='urlsduplicate_id_url_duplicated_set')
+    id_url_canonical = models.OneToOneField(Urls, models.CASCADE, db_column='id_url_canonical', primary_key=True) # The composite primary key (id_url_canonical, id_url_duplicated) found, that is not supported. The first column is selected.
+    id_url_duplicated = models.ForeignKey(Urls, models.CASCADE, db_column='id_url_duplicated', related_name='urlsduplicate_id_url_duplicated_set')
 
     class Meta:
         managed = False
@@ -98,10 +98,10 @@ class UrlsDuplicate(models.Model):
         return "{} {} ".format(self.id_url_duplicated, self.id_url_canonical)
 
 class UrlsSourceSearch(models.Model):
-    id_url = models.OneToOneField(Urls, models.DO_NOTHING, db_column='id_url', primary_key=True) # The composite primary key (id_url, id_source, id_search) found, that is not supported. The first column is selected.
-    #id_url = models.ForeignKey(Urls, models.DO_NOTHING, db_column='id_url')
-    id_source = models.ForeignKey(Source, models.DO_NOTHING, db_column='id_source')
-    id_search = models.ForeignKey(Search, models.DO_NOTHING, db_column='id_search')
+    id_url = models.OneToOneField(Urls, models.CASCADE, db_column='id_url', primary_key=True) # The composite primary key (id_url, id_source, id_search) found, that is not supported. The first column is selected.
+    #id_url = models.ForeignKey(Urls, models.CASCADE, db_column='id_url')
+    id_source = models.ForeignKey(Source, models.CASCADE, db_column='id_source')
+    id_search = models.ForeignKey(Search, models.CASCADE, db_column='id_search')
 
     class Meta:
         managed = False
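
Every on_delete in these models moves from models.DO_NOTHING to models.CASCADE, so deleting a Urls row now also removes its dependent content, duplicate links, and source/search rows. Because the tables are managed = False, Django emulates the cascade in the ORM at .delete() time rather than relying on a database-level constraint. A stripped-down illustration of the difference, using stand-in models rather than the project's real ones:

from django.db import models

class Urls(models.Model):
    url = models.TextField()

class UrlContent(models.Model):
    # models.CASCADE: deleting the parent Urls row deletes this row too.
    # models.DO_NOTHING would issue no extra SQL and leave the outcome to
    # the database's own FK rules (orphan row or constraint error).
    id_url = models.OneToOneField(Urls, models.CASCADE, db_column='id_url', primary_key=True)

# With CASCADE, Urls.objects.get(pk=some_id).delete() also removes the
# matching UrlContent row in the same operation.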
@@ -10,6 +10,16 @@ class FetchParser():
     def __init__(self) -> None:
         logger.debug("Initializing Fetcher Parser")
 
+    def _post_process_urls(self, raw_urls, obj_search):
+        # Searching URL Host based? Make sure results belong to that site
+        if (obj_search.type == Search.TYPE_ENUM.URL_HOST):
+            # Get clean URL host
+            url_host_clean = obj_search.search.replace("www.", "").replace("http://", "").replace("https://", "")
+            # Ensure URL host in URL
+            raw_urls = [u for u in raw_urls if url_host_clean in u]
+
+        return raw_urls
+
     def run(self):
         try:
             logger.debug("Starting FetchParser.run() for {}")
@@ -39,6 +49,9 @@ class FetchParser():
         except Exception as e:
             logger.warning("Exception while parsing input URL {}\n{}".format(url_host_protocol, str(e)))
             urls_fetched = []
 
+        # Post process URLs
+        urls_fetched = self._post_process_urls(urls_fetched, obj_search)
+
         # Write to DB
         DB_Handler().insert_raw_urls(urls_fetched, obj_source, obj_search)
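
The new _post_process_urls hook implements the "parser URL host requirement" from the commit title: for URL_HOST-type searches, any fetched URL that does not contain the cleaned host string is discarded before the results reach the database. The filter is a plain substring check, shown here in isolation with invented data:

# Stand-alone version of the _post_process_urls filtering; the search
# term and URL list are made-up examples.
search = "https://www.example.org"
url_host_clean = search.replace("www.", "").replace("http://", "").replace("https://", "")
# url_host_clean == "example.org"

raw_urls = [
    "https://example.org/news/1",      # kept: host matches
    "https://other-site.com/about",    # dropped: different host
    "https://sub.example.org/post",    # kept: substring still matches
]
filtered = [u for u in raw_urls if url_host_clean in u]

Note that a substring check also admits subdomains and any URL that merely mentions the host in its path; comparing urllib.parse.urlparse(u).netloc against the host would be a stricter alternative.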
@@ -72,7 +72,7 @@
     "queue": "default",
     "repeat": null,
     "at_front": false,
-    "timeout": null,
+    "timeout": 1800,
     "result_ttl": 86400,
     "cron_string": null,
     "scheduled_time": "2025-01-01T00:00:00+00:00",
@@ -135,7 +135,7 @@
     "queue": "default",
     "repeat": null,
     "at_front": false,
-    "timeout": null,
+    "timeout": 1800,
     "result_ttl": 86400,
     "cron_string": null,
     "scheduled_time": "2025-01-01T00:00:00+00:00",
@@ -177,7 +177,7 @@
     "queue": "default",
     "repeat": null,
     "at_front": false,
-    "timeout": null,
+    "timeout": 1800,
     "result_ttl": 86400,
     "cron_string": null,
     "scheduled_time": "2025-01-01T00:00:00+00:00",
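
All three scheduled-task fixtures change "timeout": null to "timeout": 1800, capping each job at 30 minutes, while "result_ttl": 86400 keeps results for 24 hours. These fields correspond to RQ's job options; a sketch of the same limits applied when enqueueing directly, assuming an RQ setup where the Redis connection and do_fetch task are placeholders:

from redis import Redis
from rq import Queue

def do_fetch():
    # Placeholder task body, not from this repository.
    pass

q = Queue("default", connection=Redis())
job = q.enqueue(
    do_fetch,
    job_timeout=1800,   # kill the job if it runs longer than 30 minutes
    result_ttl=86400,   # keep the finished job's result for 24 hours
)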
@@ -13,9 +13,9 @@ services:
     shm_size: 512mb
     environment:
       - SELENIUM_SLEEP_PER_PAGE=${SELENIUM_SLEEP_PER_PAGE:-4}
-      - PATH_LOGS_DIRECTORY=${PATH_LOGS_DIRECTORY:-logs}
+      - PATH_LOGS_DIRECTORY=${PATH_LOGS_DIRECTORY:-/opt/logs}
     ports:
-      - 80
+      - 80:80
     dns:
       - 1.1.1.1
       - 1.0.0.1
@@ -11,7 +11,7 @@ services:
     shm_size: 512mb
     environment:
       - SELENIUM_SLEEP_PER_PAGE=${SELENIUM_SLEEP_PER_PAGE:-4}
-      - PATH_LOGS_DIRECTORY=${PATH_LOGS_DIRECTORY:-logs}
+      - PATH_LOGS_DIRECTORY=${PATH_LOGS_DIRECTORY:-/opt/logs}
     ports:
       - 80
     dns:
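
Both compose files now default PATH_LOGS_DIRECTORY to the absolute path /opt/logs instead of the relative logs, so log placement no longer depends on the container's working directory; the first service also pins the host port with 80:80 instead of letting Docker assign an ephemeral one. On the application side the variable would typically be read as below; the logger setup is a hypothetical sketch, not this repository's code:

import logging
import os

# Mirrors the compose default; any value set in the environment wins.
log_dir = os.environ.get("PATH_LOGS_DIRECTORY", "/opt/logs")
os.makedirs(log_dir, exist_ok=True)
logging.basicConfig(
    filename=os.path.join(log_dir, "fetcher.log"),  # hypothetical file name
    level=logging.DEBUG,
)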