diff --git a/app_selenium/missing_kids.py b/app_selenium/missing_kids.py
index 4a97da7..d86027b 100644
--- a/app_selenium/missing_kids.py
+++ b/app_selenium/missing_kids.py
@@ -12,6 +12,7 @@ class MissingKidsFetcher():
         pass
 
     def get_missing_kids_urls(self, first_n_pages=-1):
+        logger.info("Get MissingKids, #pages: {}".format(first_n_pages))
         # Poster URL
         url = "https://www.missingkids.org/gethelpnow/search/poster-search-results"
         # URLs
diff --git a/app_urls/fetcher/models.py b/app_urls/fetcher/models.py
index f33dd46..c179ee4 100644
--- a/app_urls/fetcher/models.py
+++ b/app_urls/fetcher/models.py
@@ -43,7 +43,7 @@ class StatusPatternMatching(models.Model):
         return "{} -> {} [Priority: {}]".format(self.pattern, self.status, self.priority)
 
 class UrlContent(models.Model):
-    id_url = models.OneToOneField('Urls', models.DO_NOTHING, db_column='id_url', primary_key=True)
+    id_url = models.OneToOneField('Urls', models.CASCADE, db_column='id_url', primary_key=True)
     date_published = models.DateTimeField(blank=True, null=True)
     title = models.TextField(blank=True, null=True)
     description = models.TextField(blank=True, null=True)
@@ -86,8 +86,8 @@ class Urls(models.Model):
 
 
 class UrlsDuplicate(models.Model):
-    id_url_canonical = models.OneToOneField(Urls, models.DO_NOTHING, db_column='id_url_canonical', primary_key=True)  # The composite primary key (id_url_canonical, id_url_duplicated) found, that is not supported. The first column is selected.
-    id_url_duplicated = models.ForeignKey(Urls, models.DO_NOTHING, db_column='id_url_duplicated', related_name='urlsduplicate_id_url_duplicated_set')
+    id_url_canonical = models.OneToOneField(Urls, models.CASCADE, db_column='id_url_canonical', primary_key=True)  # The composite primary key (id_url_canonical, id_url_duplicated) found, that is not supported. The first column is selected.
+    id_url_duplicated = models.ForeignKey(Urls, models.CASCADE, db_column='id_url_duplicated', related_name='urlsduplicate_id_url_duplicated_set')
 
     class Meta:
         managed = False
@@ -98,10 +98,10 @@ class UrlsDuplicate(models.Model):
         return "{} {} ".format(self.id_url_duplicated, self.id_url_canonical)
 
 class UrlsSourceSearch(models.Model):
-    id_url = models.OneToOneField(Urls, models.DO_NOTHING, db_column='id_url', primary_key=True)  # The composite primary key (id_url, id_source, id_search) found, that is not supported. The first column is selected.
-    #id_url = models.ForeignKey(Urls, models.DO_NOTHING, db_column='id_url')
-    id_source = models.ForeignKey(Source, models.DO_NOTHING, db_column='id_source')
-    id_search = models.ForeignKey(Search, models.DO_NOTHING, db_column='id_search')
+    id_url = models.OneToOneField(Urls, models.CASCADE, db_column='id_url', primary_key=True)  # The composite primary key (id_url, id_source, id_search) found, that is not supported. The first column is selected.
+    #id_url = models.ForeignKey(Urls, models.CASCADE, db_column='id_url')
+    id_source = models.ForeignKey(Source, models.CASCADE, db_column='id_source')
+    id_search = models.ForeignKey(Search, models.CASCADE, db_column='id_search')
 
     class Meta:
         managed = False
diff --git a/app_urls/fetcher/src/fetch_parser.py b/app_urls/fetcher/src/fetch_parser.py
index e8471f6..321c0f7 100644
--- a/app_urls/fetcher/src/fetch_parser.py
+++ b/app_urls/fetcher/src/fetch_parser.py
@@ -10,6 +10,16 @@ class FetchParser():
     def __init__(self) -> None:
         logger.debug("Initializing Fetcher Parser")
 
+    def _post_process_urls(self, raw_urls, obj_search):
+        # Searching URL Host based? Make sure results belong to that site
+        if (obj_search.type == Search.TYPE_ENUM.URL_HOST):
+            # Get clean URL host
+            url_host_clean = obj_search.search.replace("www.", "").replace("http://", "").replace("https://", "")
+            # Ensure URL host in URL
+            raw_urls = [u for u in raw_urls if url_host_clean in u]
+
+        return raw_urls
+
     def run(self):
         try:
             logger.debug("Starting FetchParser.run() for {}")
@@ -39,6 +49,9 @@ class FetchParser():
             except Exception as e:
                 logger.warning("Exception while parsing input URL {}\n{}".format(url_host_protocol, str(e)))
                 urls_fetched = []
+
+            # Post process URLs
+            urls_fetched = self._post_process_urls(urls_fetched, obj_search)
 
             # Write to DB
             DB_Handler().insert_raw_urls(urls_fetched, obj_source, obj_search)
diff --git a/app_urls/scheduled_tasks.json b/app_urls/scheduled_tasks.json
index a76294d..3a63536 100644
--- a/app_urls/scheduled_tasks.json
+++ b/app_urls/scheduled_tasks.json
@@ -72,7 +72,7 @@
         "queue": "default",
         "repeat": null,
         "at_front": false,
-        "timeout": null,
+        "timeout": 1800,
         "result_ttl": 86400,
         "cron_string": null,
         "scheduled_time": "2025-01-01T00:00:00+00:00",
@@ -135,7 +135,7 @@
         "queue": "default",
         "repeat": null,
         "at_front": false,
-        "timeout": null,
+        "timeout": 1800,
         "result_ttl": 86400,
         "cron_string": null,
         "scheduled_time": "2025-01-01T00:00:00+00:00",
@@ -177,7 +177,7 @@
         "queue": "default",
         "repeat": null,
         "at_front": false,
-        "timeout": null,
+        "timeout": 1800,
         "result_ttl": 86400,
         "cron_string": null,
         "scheduled_time": "2025-01-01T00:00:00+00:00",
diff --git a/docker-compose-dev.yml b/docker-compose-dev.yml
index 12c8bce..a62174c 100644
--- a/docker-compose-dev.yml
+++ b/docker-compose-dev.yml
@@ -13,9 +13,9 @@ services:
     shm_size: 512mb
     environment:
       - SELENIUM_SLEEP_PER_PAGE=${SELENIUM_SLEEP_PER_PAGE:-4}
-      - PATH_LOGS_DIRECTORY=${PATH_LOGS_DIRECTORY:-logs}
+      - PATH_LOGS_DIRECTORY=${PATH_LOGS_DIRECTORY:-/opt/logs}
     ports:
-      - 80
+      - 80:80
     dns:
       - 1.1.1.1
       - 1.0.0.1
diff --git a/docker-compose.yml b/docker-compose.yml
index cd40205..6cfcacc 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -11,7 +11,7 @@ services:
     shm_size: 512mb
     environment:
       - SELENIUM_SLEEP_PER_PAGE=${SELENIUM_SLEEP_PER_PAGE:-4}
-      - PATH_LOGS_DIRECTORY=${PATH_LOGS_DIRECTORY:-logs}
+      - PATH_LOGS_DIRECTORY=${PATH_LOGS_DIRECTORY:-/opt/logs}
     ports:
       - 80
     dns: