Selenium logs, on delete cascade, tasks timeout, parser url host requirement

This commit is contained in:
Luciano Gervasoni
2025-04-07 09:25:43 +02:00
parent 64d2efd314
commit af3d7e030c
6 changed files with 27 additions and 13 deletions

View File

@@ -43,7 +43,7 @@ class StatusPatternMatching(models.Model):
return "{} -> {} [Priority: {}]".format(self.pattern, self.status, self.priority)
class UrlContent(models.Model):
id_url = models.OneToOneField('Urls', models.DO_NOTHING, db_column='id_url', primary_key=True)
id_url = models.OneToOneField('Urls', models.CASCADE, db_column='id_url', primary_key=True)
date_published = models.DateTimeField(blank=True, null=True)
title = models.TextField(blank=True, null=True)
description = models.TextField(blank=True, null=True)
@@ -86,8 +86,8 @@ class Urls(models.Model):
class UrlsDuplicate(models.Model):
id_url_canonical = models.OneToOneField(Urls, models.DO_NOTHING, db_column='id_url_canonical', primary_key=True) # The composite primary key (id_url_canonical, id_url_duplicated) found, that is not supported. The first column is selected.
id_url_duplicated = models.ForeignKey(Urls, models.DO_NOTHING, db_column='id_url_duplicated', related_name='urlsduplicate_id_url_duplicated_set')
id_url_canonical = models.OneToOneField(Urls, models.CASCADE, db_column='id_url_canonical', primary_key=True) # The composite primary key (id_url_canonical, id_url_duplicated) found, that is not supported. The first column is selected.
id_url_duplicated = models.ForeignKey(Urls, models.CASCADE, db_column='id_url_duplicated', related_name='urlsduplicate_id_url_duplicated_set')
class Meta:
managed = False
@@ -98,10 +98,10 @@ class UrlsDuplicate(models.Model):
return "{} {} ".format(self.id_url_duplicated, self.id_url_canonical)
class UrlsSourceSearch(models.Model):
id_url = models.OneToOneField(Urls, models.DO_NOTHING, db_column='id_url', primary_key=True) # The composite primary key (id_url, id_source, id_search) found, that is not supported. The first column is selected.
#id_url = models.ForeignKey(Urls, models.DO_NOTHING, db_column='id_url')
id_source = models.ForeignKey(Source, models.DO_NOTHING, db_column='id_source')
id_search = models.ForeignKey(Search, models.DO_NOTHING, db_column='id_search')
id_url = models.OneToOneField(Urls, models.CASCADE, db_column='id_url', primary_key=True) # The composite primary key (id_url, id_source, id_search) found, that is not supported. The first column is selected.
#id_url = models.ForeignKey(Urls, models.CASCADE, db_column='id_url')
id_source = models.ForeignKey(Source, models.CASCADE, db_column='id_source')
id_search = models.ForeignKey(Search, models.CASCADE, db_column='id_search')
class Meta:
managed = False

View File

@@ -10,6 +10,16 @@ class FetchParser():
def __init__(self) -> None:
logger.debug("Initializing Fetcher Parser")
def _post_process_urls(self, raw_urls, obj_search):
    """Keep only the fetched URLs that belong to the searched site.

    When ``obj_search.type`` is ``Search.TYPE_ENUM.URL_HOST``, the value in
    ``obj_search.search`` is treated as the site's host and every entry of
    ``raw_urls`` whose hostname is neither that host nor one of its
    subdomains is dropped. For any other search type the input is returned
    untouched.

    Hostnames are compared instead of using a plain substring test, so a URL
    from another site that merely mentions the host in its path or query
    string (or a look-alike such as ``notexample.com``) is no longer kept.

    Args:
        raw_urls: Iterable of URL strings produced by the parser.
        obj_search: Search object exposing ``type`` and ``search``.

    Returns:
        ``raw_urls`` itself when no filtering applies, otherwise a new list
        with the URLs that belong to the searched host.
    """
    # Imported locally so the block does not depend on the module's import list
    from urllib.parse import urlsplit

    def _host_of(url):
        # Lower-cased hostname without a leading "www."; "" when it cannot be determined
        if not isinstance(url, str):
            return ""
        candidate = url.strip()
        if "://" not in candidate:
            # urlsplit() only recognises a netloc after "//"
            candidate = "//" + candidate.lstrip("/")
        try:
            host = urlsplit(candidate).hostname or ""
        except ValueError:
            # Malformed netloc (e.g. unbalanced IPv6 brackets)
            return ""
        host = host.lower()
        if host.startswith("www."):
            host = host[len("www."):]
        return host

    # Searching URL Host based? Make sure results belong to that site
    if obj_search.type != Search.TYPE_ENUM.URL_HOST:
        return raw_urls

    # Get clean URL host
    url_host_clean = _host_of(obj_search.search)
    if not url_host_clean:
        # Nothing to compare against: keep everything rather than drop all results
        return raw_urls

    # Ensure the URL's host is the searched host or one of its subdomains
    subdomain_suffix = "." + url_host_clean
    urls_same_site = []
    for u in raw_urls:
        host = _host_of(u)
        if host == url_host_clean or host.endswith(subdomain_suffix):
            urls_same_site.append(u)
    return urls_same_site
def run(self):
try:
logger.debug("Starting FetchParser.run() for {}")
@@ -39,6 +49,9 @@ class FetchParser():
except Exception as e:
logger.warning("Exception while parsing input URL {}\n{}".format(url_host_protocol, str(e)))
urls_fetched = []
# Post process URLs
urls_fetched = self._post_process_urls(urls_fetched, obj_search)
# Write to DB
DB_Handler().insert_raw_urls(urls_fetched, obj_source, obj_search)

View File

@@ -72,7 +72,7 @@
"queue": "default",
"repeat": null,
"at_front": false,
"timeout": null,
"timeout": 1800,
"result_ttl": 86400,
"cron_string": null,
"scheduled_time": "2025-01-01T00:00:00+00:00",
@@ -135,7 +135,7 @@
"queue": "default",
"repeat": null,
"at_front": false,
"timeout": null,
"timeout": 1800,
"result_ttl": 86400,
"cron_string": null,
"scheduled_time": "2025-01-01T00:00:00+00:00",
@@ -177,7 +177,7 @@
"queue": "default",
"repeat": null,
"at_front": false,
"timeout": null,
"timeout": 1800,
"result_ttl": 86400,
"cron_string": null,
"scheduled_time": "2025-01-01T00:00:00+00:00",