Selenium logs, on delete cascade, tasks timeout, parser url host requirement
@@ -12,6 +12,7 @@ class MissingKidsFetcher():
         pass

     def get_missing_kids_urls(self, first_n_pages=-1):
+        logger.info("Get MissingKids, #pages: {}".format(first_n_pages))
         # Poster URL
         url = "https://www.missingkids.org/gethelpnow/search/poster-search-results"
         # URLs
@@ -43,7 +43,7 @@ class StatusPatternMatching(models.Model):
         return "{} -> {} [Priority: {}]".format(self.pattern, self.status, self.priority)

 class UrlContent(models.Model):
-    id_url = models.OneToOneField('Urls', models.DO_NOTHING, db_column='id_url', primary_key=True)
+    id_url = models.OneToOneField('Urls', models.CASCADE, db_column='id_url', primary_key=True)
     date_published = models.DateTimeField(blank=True, null=True)
     title = models.TextField(blank=True, null=True)
     description = models.TextField(blank=True, null=True)
@@ -86,8 +86,8 @@ class Urls(models.Model):


 class UrlsDuplicate(models.Model):
-    id_url_canonical = models.OneToOneField(Urls, models.DO_NOTHING, db_column='id_url_canonical', primary_key=True) # The composite primary key (id_url_canonical, id_url_duplicated) found, that is not supported. The first column is selected.
-    id_url_duplicated = models.ForeignKey(Urls, models.DO_NOTHING, db_column='id_url_duplicated', related_name='urlsduplicate_id_url_duplicated_set')
+    id_url_canonical = models.OneToOneField(Urls, models.CASCADE, db_column='id_url_canonical', primary_key=True) # The composite primary key (id_url_canonical, id_url_duplicated) found, that is not supported. The first column is selected.
+    id_url_duplicated = models.ForeignKey(Urls, models.CASCADE, db_column='id_url_duplicated', related_name='urlsduplicate_id_url_duplicated_set')

     class Meta:
         managed = False
@@ -98,10 +98,10 @@ class UrlsDuplicate(models.Model):
         return "{} {} ".format(self.id_url_duplicated, self.id_url_canonical)

 class UrlsSourceSearch(models.Model):
-    id_url = models.OneToOneField(Urls, models.DO_NOTHING, db_column='id_url', primary_key=True) # The composite primary key (id_url, id_source, id_search) found, that is not supported. The first column is selected.
-    #id_url = models.ForeignKey(Urls, models.DO_NOTHING, db_column='id_url')
-    id_source = models.ForeignKey(Source, models.DO_NOTHING, db_column='id_source')
-    id_search = models.ForeignKey(Search, models.DO_NOTHING, db_column='id_search')
+    id_url = models.OneToOneField(Urls, models.CASCADE, db_column='id_url', primary_key=True) # The composite primary key (id_url, id_source, id_search) found, that is not supported. The first column is selected.
+    #id_url = models.ForeignKey(Urls, models.CASCADE, db_column='id_url')
+    id_source = models.ForeignKey(Source, models.CASCADE, db_column='id_source')
+    id_search = models.ForeignKey(Search, models.CASCADE, db_column='id_search')

     class Meta:
         managed = False
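For context on the DO_NOTHING-to-CASCADE switch above: Django enforces on_delete in the application layer, so with models.DO_NOTHING deleting a Urls row leaves dependent rows untouched (and the database rejects the delete unless its own foreign keys cascade), while models.CASCADE makes Django's deletion collector remove them. Since these models are managed = False, the change affects only Django's delete behavior, not the table definitions. A minimal sketch of the new behavior; the import path and the url/title values are hypothetical, only the model and field names come from the diff:

    # Hypothetical module path; models as declared in the diff.
    from parser.models import Urls, UrlContent

    url = Urls.objects.create(url="https://example.org/article")  # assumes a `url` field
    UrlContent.objects.create(id_url=url, title="Example")

    # models.CASCADE: deleting the parent Urls row also deletes its UrlContent.
    # With the old models.DO_NOTHING, Django would leave the child row behind.
    url.delete()
    assert not UrlContent.objects.filter(pk=url.pk).exists()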
@@ -10,6 +10,16 @@ class FetchParser():
     def __init__(self) -> None:
         logger.debug("Initializing Fetcher Parser")

+    def _post_process_urls(self, raw_urls, obj_search):
+        # Searching URL Host based? Make sure results belong to that site
+        if (obj_search.type == Search.TYPE_ENUM.URL_HOST):
+            # Get clean URL host
+            url_host_clean = obj_search.search.replace("www.", "").replace("http://", "").replace("https://", "")
+            # Ensure URL host in URL
+            raw_urls = [u for u in raw_urls if url_host_clean in u]
+
+        return raw_urls
+
     def run(self):
         try:
             logger.debug("Starting FetchParser.run() for {}")
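The new _post_process_urls filter is substring-based, so it also keeps URLs that merely contain the host string in their path or query (e.g. a ?redirect=example.com parameter), and the chained replace calls strip "www." anywhere in the search string. A stricter variant, as a sketch and not part of the commit, would compare parsed hostnames instead (note it also rejects subdomains, which the substring check accepts):

    from urllib.parse import urlparse

    def _clean_host(value: str) -> str:
        # Accept either a bare host ("example.com") or a full URL.
        host = urlparse(value).hostname or value
        return host.removeprefix("www.").lower()  # str.removeprefix: Python 3.9+

    def url_matches_host(url: str, search_host: str) -> bool:
        return _clean_host(url) == _clean_host(search_host)

    assert url_matches_host("https://www.example.com/page", "example.com")
    assert not url_matches_host("https://notexample.com/page", "example.com")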
@@ -39,6 +49,9 @@ class FetchParser():
         except Exception as e:
             logger.warning("Exception while parsing input URL {}\n{}".format(url_host_protocol, str(e)))
             urls_fetched = []

+        # Post process URLs
+        urls_fetched = self._post_process_urls(urls_fetched, obj_search)
+
         # Write to DB
         DB_Handler().insert_raw_urls(urls_fetched, obj_source, obj_search)
@@ -72,7 +72,7 @@
         "queue": "default",
         "repeat": null,
         "at_front": false,
-        "timeout": null,
+        "timeout": 1800,
         "result_ttl": 86400,
         "cron_string": null,
         "scheduled_time": "2025-01-01T00:00:00+00:00",
@@ -135,7 +135,7 @@
         "queue": "default",
         "repeat": null,
         "at_front": false,
-        "timeout": null,
+        "timeout": 1800,
         "result_ttl": 86400,
         "cron_string": null,
         "scheduled_time": "2025-01-01T00:00:00+00:00",
@@ -177,7 +177,7 @@
         "queue": "default",
         "repeat": null,
         "at_front": false,
-        "timeout": null,
+        "timeout": 1800,
         "result_ttl": 86400,
         "cron_string": null,
         "scheduled_time": "2025-01-01T00:00:00+00:00",
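The three fixture entries switch "timeout" from null to 1800; the fields (queue, repeat, at_front, result_ttl, cron_string, scheduled_time) look like scheduled-job fixtures for an RQ-based scheduler. With null the queue's default job timeout applies; with 1800 a job is killed after 30 minutes. A rough plain-RQ equivalent when enqueueing directly (fetch_task is a hypothetical stand-in for the project's task):

    from redis import Redis
    from rq import Queue

    def fetch_task():
        ...  # placeholder body; workers need an importable function

    queue = Queue("default", connection=Redis())
    # job_timeout=1800: the worker terminates the job after 30 minutes;
    # result_ttl=86400 keeps the result for a day, matching the fixture.
    job = queue.enqueue(fetch_task, job_timeout=1800, result_ttl=86400)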
@@ -13,9 +13,9 @@ services:
     shm_size: 512mb
     environment:
       - SELENIUM_SLEEP_PER_PAGE=${SELENIUM_SLEEP_PER_PAGE:-4}
-      - PATH_LOGS_DIRECTORY=${PATH_LOGS_DIRECTORY:-logs}
+      - PATH_LOGS_DIRECTORY=${PATH_LOGS_DIRECTORY:-/opt/logs}
     ports:
-      - 80
+      - 80:80
     dns:
       - 1.1.1.1
       - 1.0.0.1
@@ -11,7 +11,7 @@ services:
     shm_size: 512mb
     environment:
       - SELENIUM_SLEEP_PER_PAGE=${SELENIUM_SLEEP_PER_PAGE:-4}
-      - PATH_LOGS_DIRECTORY=${PATH_LOGS_DIRECTORY:-logs}
+      - PATH_LOGS_DIRECTORY=${PATH_LOGS_DIRECTORY:-/opt/logs}
     ports:
       - 80
     dns:
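Two details in the compose changes: ${PATH_LOGS_DIRECTORY:-/opt/logs} uses Compose's ${VAR:-default} substitution, so /opt/logs is used whenever the variable is unset or empty; and 80:80 pins container port 80 to host port 80, whereas the bare "- 80" it replaces publishes the container port on an ephemeral host port chosen by Docker. The second compose file keeps the bare "- 80".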