Refactoring fetcher, working feeds and raw url writer

This commit is contained in:
Luciano Gervasoni
2025-03-12 17:56:40 +01:00
parent e124dbc21a
commit 61c31ee9aa
24 changed files with 2085 additions and 194 deletions

View File

@@ -118,14 +118,23 @@
" title TEXT,\n",
" description TEXT,\n",
" content TEXT,\n",
" valid_content BOOLEAN,\n",
" language CHAR(2), -- ISO 639-1 Code\n",
" keywords TEXT[],\n",
" tags TEXT[],\n",
" authors TEXT[],\n",
" image_urls TEXT[]\n",
" image_main TEXT,\n",
" images_url TEXT[],\n",
" videos_url TEXT[],\n",
" url_host TEXT, -- www.breitbart.com\n",
" site_name TEXT -- Breitbart News\n",
" );\n",
" CREATE INDEX idx_tags ON URL_CONTENT USING GIN(tags);\n",
" CREATE INDEX idx_authors ON URL_CONTENT USING GIN(authors);\n",
" CREATE INDEX idx_date_published ON URL_CONTENT (date_published);\n",
" CREATE INDEX idx_valid_content ON URL_CONTENT (valid_content);\n",
" CREATE INDEX idx_language ON URL_CONTENT (language);\n",
" CREATE INDEX idx_url_host ON URL_CONTENT (url_host);\n",
" \"\"\")\n",
"\n",
" # Feeds\n",

View File

@@ -101,95 +101,11 @@
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"('https://foxnews.com/us/utah-mommy-blogger-ruby-franke-power-public-image-allowed-child-abuse-go-unchecked-expert',\n",
" 'foxnews.com')"
]
},
"execution_count": 54,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# !pip install trafilatura trafilatura[all] cchardet\n",
"import courlan\n",
"url = \"https://www.foxnews.com/us/utah-mommy-blogger-ruby-franke-power-public-image-allowed-child-abuse-go-unchecked-expert\"\n",
"url = \"https://foxnews.com/us/utah-mommy-blogger-ruby-franke-power-public-image-allowed-child-abuse-go-unchecked-expert\"\n",
"courlan.check_url(url)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [],
"source": [
"import newspaper\n",
"\n",
"article = newspaper.article(url)"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"datetime.datetime(2025, 3, 4, 4, 0, 31, tzinfo=tzoffset(None, -18000))"
]
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"article.publish_date"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# !pip install trafilatura\n",
"import trafilatura\n",
@@ -197,9 +113,18 @@
"\n",
"url = \"https://www.foxnews.com/us/utah-mommy-blogger-ruby-franke-power-public-image-allowed-child-abuse-go-unchecked-expert\"\n",
"# url = \"https://www.missingkids.org/poster/USVA/VA25-0820/1\"\n",
"url = \"https://www.bloomberg.com/news/articles/2025-03-12/eu-launches-metals-tariff-retaliation-on-26-billion-of-us-goods\"\n",
"\n",
"# Fetch\n",
"doc = trafilatura.fetch_url(url)\n",
"doc = trafilatura.fetch_url(url)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Content & metadata\n",
"metadata = trafilatura.extract_metadata(doc)\n",
"content = trafilatura.extract(doc)"
@@ -207,40 +132,9 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'author': 'Audrey Conklin',\n",
" 'body': <Element body at 0x7e22813ce400>,\n",
" 'categories': [],\n",
" 'comments': None,\n",
" 'commentsbody': <Element body at 0x7e22813ce180>,\n",
" 'date': '2025-03-03',\n",
" 'description': \"Disgraced parenting blogger and mom of six Ruby Franke's \"\n",
" '\"power\" and public image\" allowed her crimes against her '\n",
" 'children to go \"unchecked,\" according to a defense attorney.',\n",
" 'filedate': '2025-03-08',\n",
" 'fingerprint': None,\n",
" 'hostname': 'foxnews.com',\n",
" 'id': None,\n",
" 'image': 'https://static.foxnews.com/foxnews.com/content/uploads/2024/03/967e1c1b-Franke.jpg',\n",
" 'language': None,\n",
" 'license': None,\n",
" 'pagetype': 'article',\n",
" 'raw_text': None,\n",
" 'sitename': 'Fox News',\n",
" 'tags': [],\n",
" 'text': None,\n",
" 'title': \"Utah mommy blogger Ruby Franke's power, public image allowed child \"\n",
" \"abuse to go 'unchecked': expert\",\n",
" 'url': 'https://www.foxnews.com/us/utah-mommy-blogger-ruby-franke-power-public-image-allowed-child-abuse-go-unchecked-expert'}\n"
]
}
],
"outputs": [],
"source": [
"pprint(metadata.as_dict())"
]
@@ -263,85 +157,50 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 18.6 ms, sys: 40 μs, total: 18.7 ms\n",
"Wall time: 18 ms\n"
]
},
{
"data": {
"text/plain": [
"'en'"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"'''\n",
"!pip install lingua-language-detector\n",
"import lingua\n",
"ld = lingua.LanguageDetectorBuilder.from_all_languages().build()\n",
"l = ld.detect_language_of(content)\n",
"'''\n",
"# !pip install newspaper4k\n",
"# !pip install langdetect \n",
"import newspaper\n",
"import langdetect\n",
"langdetect.DetectorFactory.seed = 0\n",
"langdetect.detect(content)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# !pip install newspaper4k"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import newspaper\n",
"\n",
"\n",
"\n",
"# url = \"https://www.missingkids.org/poster/USVA/VA25-0820/1\"\n",
"#url = \"https://www.waff.com/2025/03/11/colbert-heights-high-school-employee-arrested-child-abuse/\"\n",
"\n",
"\n",
"\n",
"#url = \"https://www.bloomberg.com/news/articles/2025-03-12/eu-launches-metals-tariff-retaliation-on-26-billion-of-us-goods\"\n",
"\n",
"\n",
"url = \"https://apnews.com/article/canada-trump-us-tariffs-steel-2517a6a2baf0596cb1a43d3a7d1e7939\"\n",
"url = \"https://www.foxnews.com/us/utah-mommy-blogger-ruby-franke-power-public-image-allowed-child-abuse-go-unchecked-expert\"\n",
"url = \"https://www.missingkids.org/poster/USVA/VA25-0820/1\"\n",
"#url = \"https://www.ft.com/content/6d7c6915-4ceb-43fc-9896-590036b12a87\"\n",
"#url = \"https://www.lanacion.com.ar/politica/milei-en-bahia-blanca-un-viaje-sorpresa-para-frenar-las-criticas-y-mostrar-cercania-nid12032025/\"\n",
"#url = \"https://www.missingkids.org/poster/NCMC/2043547/1\"\n",
"\n",
"article = newspaper.article(url)\n",
"try:\n",
" article = newspaper.article(url)\n",
"except newspaper.ArticleException as e:\n",
" print(\"ArticleException: {}\".format(str(e)))\n",
"except Exception as e:\n",
" print(\"Err: {}\".format(str(e)))\n",
"\n",
"url_photo = set([i for i in article.images if \"api.missingkids.org/photographs\" in i])"
"# url_photo = set([i for i in article.images if \"api.missingkids.org/photographs\" in i])\n",
"# article.is_valid_url(), article.is_parsed, article.is_media_news(), article.is_valid_body()\n",
"article.meta_data\n",
"\n"
]
},
{
@@ -351,6 +210,13 @@
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,

View File

@@ -3,6 +3,36 @@
conda create -n matitos_urls python=3.12
conda activate matitos_urls
pip install django psycopg[binary] django-rq
pip install feedparser python-dateutil newspaper4k lxml[html_clean]
```
* From automated inspectdb
```
# 1) Inspect DB, generate models.py
python manage.py inspectdb
# 2) models.py, within class Urls, add:
class STATUS_ENUM(models.TextChoices):
RAW = "raw"
ERROR = "error"
VALID = "valid"
UNKNOWN = "unknown"
INVALID = "invalid"
DUPLICATE = "duplicate"
# Update status
status = models.TextField(choices=STATUS_ENUM, default=STATUS_ENUM.RAW) # This field type is a guess.
# To class Meta, add default ordering
class Meta:
managed = False
db_table = 'urls' # db_table = '{}_urls'.format(project_name)
ordering = ["-ts_fetch"]
# Fields default:
ts_fetch = models.DateTimeField(auto_now_add=True)
status = models.TextField(default='raw') # This field type is a guess.
```
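With the STATUS_ENUM snippet above in place, the enum members behave as plain strings in the ORM. A minimal usage sketch (the import path is an assumption, not necessarily the actual app name):
```
from matitos_urls.models import Urls  # module path is an assumption

# New rows pick up status="raw" (field default) and ts_fetch via auto_now_add
Urls.objects.create(url="https://example.com/some-article")

# TextChoices members compare as their string values, so they work directly in filters
raw_urls = Urls.objects.filter(status=Urls.STATUS_ENUM.RAW)
```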
* Environment variables
@@ -25,10 +55,20 @@ python manage.py makemigrations
python manage.py migrate --fake
```
* Deploy
```
# Server
python manage.py runserver
# Worker
python manage.py rqworker default
while true; do python manage.py rqworker default --burst; sleep 5; done
```
* Utils
```
python manage.py rqstats
python manage.py rqstats --interval=1 # Refreshes every second
python manage.py rqstats --json # Output as JSON
python manage.py rqstats --yaml # Output as YAML
```
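* Enqueue a job (reference sketch; the job function below is a placeholder, not one defined in this repo)
```
import django_rq

def example_fetch_job():
    # Placeholder body; the project points this at its fetcher classes
    print("fetching...")

# Same "default" queue served by `python manage.py rqworker default`
queue = django_rq.get_queue("default")
queue.enqueue(example_fetch_job)
```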

View File

@@ -55,8 +55,8 @@ class UrlContent(models.Model):
class Urls(models.Model):
url = models.TextField(unique=True)
ts_fetch = models.DateTimeField()
status = models.TextField() # This field type is a guess.
ts_fetch = models.DateTimeField(auto_now_add=True)
status = models.TextField(default='raw') # This field type is a guess.
class Meta:
managed = False

View File

@@ -0,0 +1,502 @@
import psycopg
import redis
import traceback
import random
import requests
import json
import os
from .url_utils import process_article
from .logger import get_logger
logger = get_logger()
# TODO: URL_DB_HANDLER, _get_search_list, _get_url_host, _get_url_host_list, ...
# The rest, elsewhere
class DB_Handler():
def __init__(self, db_connect_info, redis_connect_info):
logger.debug("Initializing URL DB writer")
self.db_connect_info = db_connect_info
self.redis_instance = redis.Redis(host=redis_connect_info.get("host"), port=redis_connect_info.get("port"))
self.redis_expiry_seconds = redis_connect_info.get("expiry_seconds", 172800) # Default: 48 hours
try:
self.redis_instance.ping()
logger.debug("Succesfully pinged Redis")
except Exception as e:
logger.warning("Error trying to ping Redis: {}".format(str(e)))
def get_urls_count(self, last_minutes_check):
#####################
### Get number of URLs within last X minutes
#####################
try:
# Update
with psycopg.connect(self.db_connect_info) as conn:
# Open cursor
cursor = conn.cursor()
num_urls = cursor.execute("SELECT COUNT(*) FROM URLS WHERE ts_fetch >= current_timestamp - interval '{} minutes';".format(last_minutes_check)).fetchone()[0]
except Exception as e:
logger.warning("Error updating URLs status: {}".format(str(e)))
num_urls = None
return num_urls
def _get_url_host_list(self):
try:
with psycopg.connect(self.db_connect_info) as conn:
# List of URL host
list_url_host = [l[0] for l in conn.execute("SELECT url_host FROM WEBSITE_OF_INTEREST;").fetchall()]
# Clean http / https from URLs
list_url_host = [l.replace("https://", "").replace("http://", "") for l in list_url_host]
# Clean last slash if exists
list_url_host = [ l if not l.endswith("/") else l[:-1] for l in list_url_host]
except Exception as e:
logger.warning("Exception fetching URL host list: " + str(e))
list_url_host = []
return list_url_host
def _get_search_list(self):
try:
with psycopg.connect(self.db_connect_info) as conn:
# List of keyword searches
list_search_text = [l[0] for l in conn.execute("SELECT keyword_search FROM SEARCH;").fetchall()]
except Exception as e:
logger.warning("Exception fetching searches list: " + str(e))
list_search_text = []
return list_search_text
def _get_feed_urls(self):
try:
with psycopg.connect(self.db_connect_info) as conn:
list_url_feeds = conn.execute("SELECT rss_feed FROM FEED;").fetchall()
# Decode (tuple with 1 element)
list_url_feeds = [l[0] for l in list_url_feeds]
except Exception as e:
logger.warning("Exception fetching RSS sites: " + str(e))
list_url_feeds = []
return list_url_feeds
def _get_url_hosts(self):
try:
with psycopg.connect(self.db_connect_info) as conn:
list_url_hosts = conn.execute("SELECT url_host FROM WEBSITE_OF_INTEREST;").fetchall()
# Decode (tuple with 1 element)
list_url_hosts = [l[0] for l in list_url_hosts]
except Exception as e:
logger.warning("Exception fetching RSS sites: " + str(e))
list_url_hosts = []
return list_url_hosts
def _format(self, values):
# Replace single quote ' with ''. Based on https://stackoverflow.com/a/12320729
# String -> 'string', Int -> '1' (string-based), None -> NULL (no quotes for pgSQL to interpret Null value)
if (type(values) == list) or (type(values) == tuple):
insert_args = "(" + ", ".join([ "NULL" if v is None else "'" + str(v).replace("'", "''") + "'" for v in values]) + ")"
elif (type(values) == str):
insert_args = "({})".format( "NULL" if values is None else "'" + values.replace("'", "''") + "'" )
else:
logger.warning("Error formatting input values: {}".format(values))
assert False
return insert_args
def _get_cached_canonical_url(self, url):
### Redis: URL processed recently? -> Avoid increasing SERIAL counter & efficiency of DB
try:
filter_url = self.redis_instance.get(url)
if (filter_url is not None):
filter_url = filter_url.decode("utf-8")
except Exception as e:
logger.warning("Exception querying Redis: {}".format(str(e)))
filter_url = None
return filter_url
def _update_urls_status(self, dict_status_ids):
#####################
### Update status to array of URL IDs
#####################
try:
# Update
with psycopg.connect(self.db_connect_info) as conn:
# Open cursor
cursor = conn.cursor()
# Autocommit at end of transaction (Atomic insert of URLs and sources)
with conn.transaction() as tx:
for key_status, value_ids in dict_status_ids.items():
cursor.execute("UPDATE URLS SET status='{}' WHERE id IN ({});".format(key_status, ",".join([str(v) for v in value_ids])))
except Exception as e:
logger.warning("Error updating URLs status: {}".format(str(e)))
def _get_missing_kids_urls(self, num_urls=None):
#####################
### Get list of Missing Kids URLs
#####################
try:
missing_kids_ids_and_urls = []
if (num_urls is None):
limit = 500
else:
limit = num_urls
offset = 0
with psycopg.connect(self.db_connect_info) as conn:
# Open cursor
cursor = conn.cursor()
while True:
# Query
missing_kids_ids_and_urls_query = cursor.execute("SELECT id, url, status FROM URLS WHERE url LIKE '%missingkids.org/poster%' ORDER BY ts_fetch DESC LIMIT {} OFFSET {};".format(limit, offset)).fetchall()
# Finished?
if (len(missing_kids_ids_and_urls_query) == 0):
break
# Extend
missing_kids_ids_and_urls = missing_kids_ids_and_urls + missing_kids_ids_and_urls_query
# Offset
offset += len(missing_kids_ids_and_urls_query)
# Stop?
if (num_urls is not None) and (len(missing_kids_ids_and_urls) >= num_urls):
break
except Exception as e:
logger.warning("Error getting Missing Kids URLs: {}".format(str(e)))
missing_kids_ids_and_urls = []
return missing_kids_ids_and_urls
def _get_error_urls(self, num_urls=None):
#####################
### Get list of URLs with status 'error'
#####################
try:
error_urls = []
if (num_urls is None):
limit = 500
else:
limit = num_urls
offset = 0
with psycopg.connect(self.db_connect_info) as conn:
# Open cursor
cursor = conn.cursor()
while True:
# Query
error_urls_query = cursor.execute("SELECT id, url FROM URLS WHERE status='error' ORDER BY ts_fetch DESC LIMIT {} OFFSET {};".format(limit, offset)).fetchall()
# Finished?
if (len(error_urls_query) == 0):
break
# Extend
error_urls = error_urls + error_urls_query
# Offset
offset += len(error_urls_query)
# Stop?
if (num_urls is not None) and (len(error_urls) >= num_urls):
break
except Exception as e:
logger.warning("Error getting Error URLs: {}".format(str(e)))
error_urls = []
return error_urls
def _decode_urls(self, urls_fetched, list_domains_to_filter, list_pattern_status_tuple): # TODO: language for urls_fetched...
"""
# TODO: REFACTOR
For each input url
Already processed?
-> Update on Redis expire time
-> Associate to source
Not processed? Get main URL:
-> URL Canonical valid?
-> Rely on this as main URL
-> URL Canonical not valid?
-> Use input url, unless it's a news.google.com link
-> If news.google.com link, filter out. REDIS?
Main URL processing:
-> Update in REDIS, association url -> url_canonical
-> url != url_canonical: Add in duplicate table
If both != news.google.com
"""
# URLs to insert, URLs duplicated association, URL to Canonical form
list_insert_url_tuple_args, list_tuple_canonical_duplicate_urls, dict_full_urls_to_canonical = [], [], {}
# URL VS CANONICAL:
# News URL returned: https://news.google.com/articles/CBMifmh0dHBzOi8vd3d3LmJyZWl0YmFydC5jb20vMm5kLWFtZW5kbWVudC8yMDIzLzA0LzAzL2dvdi1kZXNhbnRpcy1zaWducy1iaWxsLW1ha2luZy1mbG9yaWRhLXRoZS0yNnRoLWNvbnN0aXR1dGlvbmFsLWNhcnJ5LXN0YXRlL9IBAA?hl=en-US&gl=US&ceid=US%3Aen
# Corresponds to canonical URL: https://www.breitbart.com/2nd-amendment/2023/04/03/gov-desantis-signs-bill-making-florida-the-26th-constitutional-carry-state/
for url in urls_fetched:
# Domain to filter? Input url
filter_due_to_domain = False
for domain_to_filter in list_domains_to_filter:
if (domain_to_filter in url):
logger.debug("Domain filter applied based on {} for input URL: {}".format(domain_to_filter, url))
filter_due_to_domain = True
if (filter_due_to_domain):
continue
# URL processed recently? -> Filter and avoid increasing SERIAL counter & efficiency of DB
cached_canonical_url = self._get_cached_canonical_url(url)
if (cached_canonical_url is not None):
# Even if url processed, need to add url_canonical to list_filtered_urls, so as to associate search source to canonical URL (canonical is the main URL entry)
dict_full_urls_to_canonical[url] = cached_canonical_url # X -> Y
# If url has been processed, so was its canonical form
logger.debug("Filtering out already inserted (processed) URL and its canonical form: {} {}".format(url, cached_canonical_url))
continue
# Process TODO: Add language...
url_canonical, article_elements, article_status = process_article(url, list_pattern_status_tuple)
# TODO: Store article_elements information to insert into OS after inserted into DB (and therefore having associated url_id)
# Could not retrieve redirection for news.google.com based URL? Continue (avoid inserting in DB)
if (url_canonical is None) and ("news.google.com" in url):
logger.debug("Filtering empty canonical link for base URL based on news.google.com: {}".format(url))
continue
# Canonical URL still news.google.com? Continue (avoid inserting in DB)
if (url_canonical is not None) and ("news.google.com" in url_canonical):
logger.debug("Filtering canonical news.google.com based URL: {}".format(url_canonical))
continue
# Domain to filter? Input canonical_url
filter_due_to_domain = False
for domain_to_filter in list_domains_to_filter:
if (url_canonical is not None) and (domain_to_filter in url_canonical):
filter_due_to_domain = True
if (filter_due_to_domain):
logger.info("Filtering due to domain input URL, Canonical_URL: {} {}".format(url, url_canonical))
continue
if (url_canonical is None) or (article_status == "error"):
logger.debug("Processing failed for URL: {}".format(url))
# Still insert URL with "error"? -> If processed later, might have inconsistent sources (url vs url_canonical). Only store if not news.google.com based
if ("news.google.com" in url) or ("consent.google.com" in url):
logging.debug("Not able to process Google News link, skipping: {}".format(url))
else:
dict_full_urls_to_canonical[url] = url # X -> X
list_insert_url_tuple_args.append( (url, article_status) )
continue
# URL was not processed (not sure canonical yet). Generate URL_CANONICAL <-> URL_ORIGINAL association if they're different
if (url_canonical != url):
list_tuple_canonical_duplicate_urls.append( (url_canonical, url) )
# Dict: url -> canonical (update association)
dict_full_urls_to_canonical[url] = url_canonical # X -> Y or X
# Canonical URL processed recently? -> Filter and avoid increasing SERIAL counter & efficiency of DB
if (self._get_cached_canonical_url(url_canonical) is not None):
# Canonical URL was already processed
logger.debug("Filtering out already inserted (processed) URL canonical: {}".format(url_canonical))
else:
# Insert url_canonical to DB formatted
list_insert_url_tuple_args.append( (url_canonical, article_status) )
# Canonical URL different? Process
if (url_canonical != url):
if ("news.google.com" in url) or ("consent.google.com" in url):
logging.debug("Not adding google.news.com based link, skipping: {}".format(url))
else:
# Fetched url -> duplicate (using canonical as main link)
article_status = "duplicate"
# Insert url (non-canonical) to DB formatted
list_insert_url_tuple_args.append( (url, article_status) )
return list_insert_url_tuple_args, list_tuple_canonical_duplicate_urls, dict_full_urls_to_canonical
def _insert_urls(self, cursor, list_insert_url_tuple_args):
#####################
### Insert URLs with status
#####################
if (len(list_insert_url_tuple_args) > 0):
insert_args = ', '.join( [ self._format(t) for t in list_insert_url_tuple_args] )
# Insert. (url_1, status_1), (url_2, status_2), ...
sql_code = "INSERT INTO URLS {} VALUES {} ON CONFLICT (url) DO NOTHING;".format("(url, status)", insert_args)
# logger.debug("SQL CODE: {}".format(sql_code))
c = cursor.execute(sql_code)
# NOTE: Not using "RETURNING id" since previously inserted URLs are not returned (ON CONFLICT)
# https://stackoverflow.com/questions/35949877/how-to-include-excluded-rows-in-returning-from-insert-on-conflict/35953488#35953488
def _insert_urls_duplicated(self, cursor, list_tuple_canonical_duplicate_urls):
#####################
### Insert duplicated URLs
#####################
if (len(list_tuple_canonical_duplicate_urls) > 0):
# Flatten, format, set to remove duplicates
args_duplicated_urls_set = "(" + ', '.join( set( [ "'" + str(y).replace("'", "''") + "'" for x in list_tuple_canonical_duplicate_urls for y in x] ) ) + ")"
# Dict: url -> id
dict_url_to_id = {}
# Get url -> id association to populate duplicated URLs
for (id_, url_) in cursor.execute("SELECT id, url FROM URLS WHERE url IN {};".format(args_duplicated_urls_set)).fetchall():
dict_url_to_id[url_] = id_
# Convert tuples (url_canonical, url) -> (id_url_canonical, id_url) to insert in DB
# ORIGINAL CODE. Issue, might not have found association to all urls
### list_tuple_canonical_duplicate_urls_ids = [ (dict_url_to_id[t[0]], dict_url_to_id[t[1]]) for t in list_tuple_canonical_duplicate_urls]
list_tuple_canonical_duplicate_urls_ids = []
for (url_1, url_2) in list_tuple_canonical_duplicate_urls:
id_url_1, id_url_2 = dict_url_to_id.get(url_1), dict_url_to_id.get(url_2)
if (id_url_1 is None) or (id_url_2 is None):
logger.debug("Skipping duplicate association due to no url -> id_url mapping available for tuple: {} {}".format(url_1, url_2))
else:
list_tuple_canonical_duplicate_urls_ids.append( (id_url_1, id_url_2) )
if (len(list_tuple_canonical_duplicate_urls_ids) > 0):
insert_args = ', '.join( [ self._format(t) for t in list_tuple_canonical_duplicate_urls_ids] )
# Insert. (id_url_canonical_1, id_url_1), ...
sql_code = "INSERT INTO URLS_DUPLICATE {} VALUES {} ON CONFLICT DO NOTHING;".format("(id_url_canonical, id_url_duplicated)", insert_args)
# logger.debug("SQL CODE: {}".format(sql_code))
c = cursor.execute(sql_code)
def _get_pattern_status_list(self):
#####################
### Get list of (pattern, priority, status) tuples to override URL status
#####################
# TODO: Cache on redis and query once every N hours? ...
try:
with psycopg.connect(self.db_connect_info) as conn:
# Open cursor
cursor = conn.cursor()
# TODO: Cache on Redis
list_pattern_status = cursor.execute("SELECT pattern, priority, status FROM STATUS_PATTERN_MATCHING;").fetchall()
except Exception as e:
logger.warning("Error getting pattern status list: {}".format(str(e)))
list_pattern_status = []
return list_pattern_status
def _get_domains_to_filter(self):
#####################
### Get list of domains to filter
#####################
# TODO: Cache on redis and query once every N hours? ...
try:
with psycopg.connect(self.db_connect_info) as conn:
# Open cursor
cursor = conn.cursor()
# TODO: Cache on Redis
sites_to_filter = [e[0] for e in cursor.execute("SELECT url_host FROM WEBSITE_TO_FILTER;").fetchall() ]
except Exception as e:
logger.warning("Error getting domains to filter: {}".format(str(e)))
sites_to_filter = []
return sites_to_filter
def _get_cached_source_id(self, source):
### Redis: URL processed recently? -> Avoid increasing SERIAL counter & efficiency of DB
try:
source_id = self.redis_instance.get(source)
if (source_id is not None):
source_id = source_id.decode("utf-8")
except Exception as e:
logger.warning("Exception querying Redis: {}".format(str(e)))
source_id = None
return source_id
def _get_source_id(self, cursor, source):
#####################
### Get source corresponding id
#####################
# Cached?
id_source = self._get_cached_source_id(source)
if (id_source is None):
c = cursor.execute("SELECT id FROM SOURCE WHERE source='{}'".format(source.replace("'", "''"))).fetchone()
if (c is None) or (len(c) == 0):
# Source does not exist, insert and get id
c = cursor.execute("INSERT INTO SOURCE (source) VALUES ('{}') RETURNING id;".format(source.replace("'", "''"))).fetchone()
# Decode source id
id_source = c[0]
# Cache
print("*"*10, source, id_source)
self.redis_instance.set(source, id_source, ex=self.redis_expiry_seconds)
return id_source
def _get_urls_id(self, cursor, urls_full):
#####################
### Get id of inserted and filtered URLs
#####################
# TODO: Cache url -> url_id, url_canonical
if (len(urls_full) == 0):
return []
# Get inserted and filtered URL ids (unnested). Filtered URLs are also retrieved since they might have been fetched from a new source
in_inserted_filtered_urls = "(" + ', '.join(["'" + u.replace("'", "''") + "'" for u in urls_full]) + ")"
id_urls_related = [ i[0] for i in cursor.execute("SELECT id FROM URLS WHERE url IN {};".format(in_inserted_filtered_urls)).fetchall() ]
return id_urls_related
def _insert_urls_source(self, cursor, id_urls_related, id_source):
#####################
### Insert URL sources: (id_url_1, id_source), (id_url_2, id_source), ...
#####################
if (len(id_urls_related) == 0) or (id_source is None):
return
columns = "(id_url, id_source)"
insert_args = ', '.join( [ self._format([id_url, id_source]) for id_url in id_urls_related ] )
# Insert
sql_code = "INSERT INTO URLS_SOURCE {} VALUES {} ON CONFLICT DO NOTHING;".format(columns, insert_args)
# logger.debug("SQL CODE: {}".format(sql_code))
c = cursor.execute(sql_code)
def write_batch(self, urls_fetched, source):
# Chunks of 50 elements
n = 50
# Divide in small chunks
urls_fetched_chunks = [urls_fetched[i:i + n] for i in range(0, len(urls_fetched), n)]
# Process
for urls_fetched_chunk_i in urls_fetched_chunks:
self._write_small_batch(urls_fetched_chunk_i, source)
def _write_small_batch(self, urls_fetched, source):
try:
logger.info("Fetched #{} URLs, source: {}".format(len(urls_fetched), source))
if (len(urls_fetched) == 0):
logger.debug("Empty batch of urls (not writing to DB) for source: {}".format(source))
return
# Shuffle URLs to reduce continuous URLs of same URL host (minimize chance of being blocked for too many continuous requests)
random.shuffle(urls_fetched)
# Get list of domains to filter
list_domains_to_filter = self._get_domains_to_filter()
# Get list of (pattern, priority, status) tuples to override status if required
list_pattern_status_tuple = self._get_pattern_status_list()
# Sort pattern tuples by priority
list_pattern_status_tuple.sort(key=lambda tup: tup[1], reverse=True)
# Process URLs to update DB
list_insert_url_tuple_args, list_tuple_canonical_duplicate_urls, dict_full_urls_to_canonical = self._decode_urls(urls_fetched, list_domains_to_filter, list_pattern_status_tuple)
# Full set of URL and its canonical form (to associate them to a search), both to insert and filter
urls_full = set(dict_full_urls_to_canonical.keys()).union( set(dict_full_urls_to_canonical.values()) )
# Insert
with psycopg.connect(self.db_connect_info) as conn:
# Open cursor
cursor = conn.cursor()
# Autocommit at end of transaction (Atomic insert of URLs and sources)
with conn.transaction() as tx:
# Insert processed URLs
self._insert_urls(cursor, list_insert_url_tuple_args)
# Insert URLs duplicated (canonical != fetched url)
self._insert_urls_duplicated(cursor, list_tuple_canonical_duplicate_urls)
# Get source id in DB
id_source = self._get_source_id(cursor, source)
# Get IDs of all related URLs
id_urls_related = self._get_urls_id(cursor, urls_full)
# Insert search source associated to URLs
self._insert_urls_source(cursor, id_urls_related, id_source)
# Update Redis status of inserted and filtered URLs after writing to DB
for url, url_canonical in dict_full_urls_to_canonical.items():
try:
# Set with updated expiry time
self.redis_instance.set(url, url_canonical, ex=self.redis_expiry_seconds)
if (url != url_canonical):
self.redis_instance.set(url_canonical, url_canonical, ex=self.redis_expiry_seconds)
except Exception as e:
logger.warning("Exception running set in Redis: {}".format(str(e)))
if (len(list_insert_url_tuple_args) > 0):
try:
webhook_token = os.environ.get("CLIQ_WEBHOOK_TOKEN")
endpoint_message = "https://cliq.zoho.com/api/v2/channelsbyname/urlretrievalbot/message?zapikey={}".format(webhook_token)
payload = json.dumps({"text": "Fetched #{} new URLs, source: {}".format(len(list_insert_url_tuple_args), source) })
r = requests.post(endpoint_message, data=payload)
except Exception as e:
logger.warning("Webhook failed: {}".format(str(e)))
logger.debug("URL DB write finished")
except Exception as e:
logger.warning( "Exception writing to URL_DB:\n{}".format(traceback.format_exc()) )
logger.debug( "Exception --- List of URLs: {}".format(str(urls_fetched)) )

View File

@@ -0,0 +1,48 @@
from .db_utils import DB_Handler
import feedparser
import dateutil.parser
from .logger import get_logger
logger = get_logger()
class FetchFeeds():
def __init__(self, db_handler: DB_Handler) -> None:
logger.debug("Initializing News feed")
self.db_handler = db_handler
def run(self):
try:
logger.debug("Starting NewsFeed.run()")
# Get feeds
list_url_feeds = self.db_handler._get_feed_urls()
logger.debug("Fetching news from feeds: {}".format(str(list_url_feeds)))
# Process via RSS feeds
for url_feed in list_url_feeds:
# Initialize
urls_fetched, urls_publish_date = [], []
# Fetch feeds
feeds = feedparser.parse(url_feed)
# Parse
for f in feeds.get("entries", []):
# Get URL
url = f.get("link", None)
# Process?
if (url is not None):
# Available publish date?
publish_date_parsed = f.get("published_parsed")
if (publish_date_parsed is None):
publish_date = f.get("published", None)
if (publish_date is not None):
publish_date_parsed = dateutil.parser.parse(publish_date)
# Published date
urls_publish_date.append(publish_date_parsed)
# URL
urls_fetched.append(url)
# URL fetching source
source = "feed {}".format(url_feed)
# Write to DB
self.db_handler.write_batch(urls_fetched, source)
except Exception as e:
logger.warning("Exception in NewsFeed.run(): {}".format(str(e)))

View File

@@ -0,0 +1,45 @@
from .db_utils import DB_Handler
import newspaper
from .logger import get_logger
logger = get_logger()
class FetchParser():
def __init__(self, db_handler: DB_Handler) -> None:
logger.debug("Initializing News SiteParsing newspaper4k")
self.db_handler = db_handler
# TODO: MOVE LOGIC ELSEWHERE!
def _postprocess(self, article_urls):
return [url.replace("#comment-stream", "") for url in article_urls]
def run(self):
try:
logger.debug("Starting NewsSiteParsing.run() for {}")
# Get URL hosts
list_url_hosts = self.db_handler._get_url_hosts()
logger.info("Fetching news by parsing URL hosts: {}".format(str(list_url_hosts)))
# Process newspaper4k build method
for url_host_feed in list_url_hosts:
# Protocol
if not (url_host_feed.startswith("http")):
url_host_feed_formatted = "https://" + url_host_feed
else:
url_host_feed_formatted = url_host_feed
logger.debug("Fetching newspaper4k parsing based on URL: {}".format(url_host_feed_formatted))
# Source object
url_host_built = newspaper.build(url_host_feed_formatted)
# Get articles URL list
urls_fetched = url_host_built.article_urls()
# TODO: MOVE!
# Post-processing
urls_fetched = self._postprocess(urls_fetched)
# URL fetching source
source = "newspaper4k {}".format(url_host_feed)
# Write to DB
self.db_handler.write_batch(urls_fetched, source)
except Exception as e:
logger.warning("Exception in NewsSiteParsing.run(): {}".format(str(e)))

View File

@@ -0,0 +1,73 @@
from .db_utils import DB_Handler
from .utils import get_searxng_instances
from .fetch_search_sources import FetcherDuckDuckGo, FetcherGNews, FetcherGoogleNews, FetcherSearxNews, FetcherPreSearch
from .logger import get_logger
logger = get_logger()
class FetchSearcher():
def __init__(self, db_handler: DB_Handler, full=True) -> None:
logger.debug("Initializing News feed")
self.db_handler = db_handler
self.full_search = full
def _run_fetching(self, search_text):
logger.debug("Starting _run_fetching() for {}".format(search_text))
# Common parameters
lang, region = "en", "US"
### PreSearch
dict_params_news = {"search": search_text}
FetcherPreSearch(**dict_params_news).fetch_articles(self.db_handler)
### DuckDuckGo
period = "d"
dict_params_news = {"search": search_text, "lang": "wt", "region": "wt", "search_category": "news", "period": period}
FetcherDuckDuckGo(**dict_params_news).fetch_articles(self.db_handler)
dict_params_general = {"search": search_text, "lang": "wt", "region": "wt", "search_category": "general", "period": period}
FetcherDuckDuckGo(**dict_params_general).fetch_articles(self.db_handler)
if (self.full_search):
# Avoid site:{} search due to G-Bypass required time
if ("site:" not in search_text):
### GNews
dict_params = {"search": search_text, "lang": "wt", "region": "wt", "period": period}
FetcherGNews(**dict_params).fetch_articles(self.db_handler)
### GoogleNews
dict_params_news = {"search": search_text, "lang": lang, "region": region, "search_category": "news", "period": period}
FetcherGoogleNews(**dict_params_news).fetch_articles(self.db_handler)
# dict_params_general = {"search": search_text, "lang": lang, "region": region, "search_category": "general", "period": period}
if False:
### SearxNG
period = "day"
for searx_instance in get_searxng_instances():
dict_params_news = {"search": search_text, "searx_instance": searx_instance, "lang": lang, "region": region, "search_category": "news", "period": period}
dict_params_general = {"search": search_text, "searx_instance": searx_instance, "lang": lang, "region": region, "search_category": "general", "period": period}
# Append thread
FetcherSearxNews(**dict_params_news).fetch_articles(self.db_handler)
FetcherSearxNews(**dict_params_general).fetch_articles(self.db_handler)
logger.debug("Finished _run_fetching()")
def run(self):
try:
logger.info("Fetching text searches & URL hosts of interest")
# Get text searches of interest
list_search_text_of_interest = self.db_handler._get_search_list()
# Get URL host of interest
list_url_host = self.db_handler._get_url_host_list()
# Get text searches for URL hosts
list_search_text_url_host = ["site:{}".format(l) for l in list_url_host]
for search_text in list_search_text_of_interest + list_search_text_url_host:
logger.debug("Fetching news for search: {}".format(search_text))
self._run_fetching(search_text)
logger.info("Finished fetching text searches & URL hosts of interest")
except Exception as e:
logger.warning("Exception in NewsSearch.run(): {}".format(str(e)))

View File

@@ -0,0 +1,384 @@
from duckduckgo_search import DDGS
from gnews import GNews
from GoogleNews import GoogleNews
import requests
from bs4 import BeautifulSoup
import os
import time
import json
import numpy as np
import random
from .google_bypass import GoogleByPass
from abc import ABC, abstractmethod
from .logger import get_logger
logger = get_logger()
# Generic fetcher (fetches articles, writes to DB)
class FetcherAbstract(ABC):
@abstractmethod
def _fetch(self):
pass
def fetch_articles(self, db_writer):
logger.debug("Starting fetch() for {}".format(self.name))
# Fetch articles
list_news = self._fetch()
logger.info("Found #{} articles for search: {}".format(len(list_news), self.name))
# Write to DB
db_writer.write_batch(list_news, self.name)
# https://techblog.willshouse.com/2012/01/03/most-common-user-agents/
user_agents_list = [
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/111.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.3 Safari/605.1.15",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/111.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; rv:111.0) Gecko/20100101 Firefox/111.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/111.0",
"Mozilla/5.0 (X11; Linux x86_64; rv:102.0) Gecko/20100101 Firefox/102.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/112.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.44",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.54",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 OPR/96.0.0.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 OPR/97.0.0.0",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; rv:102.0) Gecko/20100101 Firefox/102.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.48",
"Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/110.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.34",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/110.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.39",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.2 Safari/605.1.15",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; rv:112.0) Gecko/20100101 Firefox/112.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.51",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/109.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/112.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/112.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/110.0",
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/110.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0",
"Mozilla/5.0 (X11; CrOS x86_64 14541.0.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 YaBrowser/23.3.0.2246 Yowser/2.5 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64; rv:108.0) Gecko/20100101 Firefox/108.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.6.1 Safari/605.1.15",
"Mozilla/5.0 (Windows NT 6.1; rv:102.0) Gecko/20100101 Goanna/6.0 Firefox/102.0 PaleMoon/32.0.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.41",
"Mozilla/5.0 (Windows NT 10.0; rv:110.0) Gecko/20100101 Firefox/110.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 YaBrowser/23.1.5.708 Yowser/2.5 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/113.0",
"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36"
]
class FetcherPreSearch(FetcherAbstract):
def __init__(self, search):
"""
# period ->
- h = hours (eg: 12h)
- d = days (eg: 7d)
- m = months (eg: 6m)
- y = years (eg: 1y)
"""
self.search = search
self.period = "1d" # TODO Fixed for the moment
# self.lang = lang
# self.region = region
search_category = "news"
self.name = "presearch {} {} {}".format(search, search_category, self.period)
def _fetch(self):
try:
# PreSearch fetching endpoint, parameter search keyword
presearch_fetch_endpoint = "http://selenium_app:80/fetch_presearch/?search_keyword={}".format(self.search)
# Timeout: 15 minutes
r = requests.get(presearch_fetch_endpoint, timeout=900)
# Decode
list_news = json.loads(r.text).get("list_urls", [])
except Exception as e:
logger.warning("Timeout on request: {}. {}".format(presearch_fetch_endpoint, str(e)))
list_news = []
return list_news
class FetcherGNews(FetcherAbstract):
def __init__(self, search, period, lang="en", region="US"):
"""
# period ->
- h = hours (eg: 12h)
- d = days (eg: 7d)
- m = months (eg: 6m)
- y = years (eg: 1y)
"""
self.search = search
self.period = period
self.lang = lang
self.region = region
search_category = "news"
self.name = "gnews {} {} {} {}".format(search, search_category, period, "{}-{}".format(lang, region))
def _fetch(self):
try:
list_dict_news = GNews(self.lang, self.region, period=self.period).get_news(self.search)
# Decode
list_news = []
for l in list_dict_news:
list_news.append(l.get("url"))
except Exception as e:
logger.warning("Exception fetching {}: {}".format(self.name, str(e)))
list_news = []
# Bypass Google links
list_news_redirections = GoogleByPass().bypass_google_urls(list_news)
return list_news_redirections
class FetcherGoogleNews(FetcherAbstract):
def __init__(self, search, search_category="news", period="1d", lang="en", region="US"):
assert(search_category in ["news", "general"])
self.lang = lang
self.region = region
self.period = period
self.search_category = search_category
self.search = search
self.name = "googlenews {} {} {} {}".format(search, search_category, period, "{}-{}".format(lang, region))
def _fetch(self):
try:
# Initialize
g = GoogleNews(encode="utf-8", period=self.period, lang=self.lang, region=self.region)
g.enableException(True)
if (self.search_category == "general"):
set_links = set()
# Search
g.search(self.search)
# Iterate pages
MAX_ITER_PAGES = 15
for i in range(MAX_ITER_PAGES):
time.sleep(random.uniform(1, 1.5))
num_before = len(set_links)
# Get page
try:
links = g.page_at(i)
except Exception as e:
logger.warning("Exception fetching page in GoogleNews {}: {}".format(self.name, str(e)))
break
# Links
for l in links:
# '/url?esrc=s&q=&rct=j&sa=U&url=https://www.breitbart.com/news/scent-of-luxury-indias-jasmine-infuses-global-perfume/&ved=2ahUKEwjOybGSiN-AAxX1gv0HHfqSBpMQxfQBegQICBAC&usg=AOvVaw06GdoHyzPbIopUaEuUSQPQ'
url = l.get("link").split("url=")[-1]
set_links.add(url)
num_after = len(set_links)
# Finished?
if (num_before == num_after):
logger.debug("Iterated {} pages on GoogleNews general search".format(i))
break
# To list
list_news = list(set_links)
elif (self.search_category == "news"):
# Search
g.get_news(self.search)
# Fetch
list_news = g.get_links()
except Exception as e:
logger.warning("Exception fetching {}: {}".format(self.name, str(e)))
list_news = []
# Bypass Google links
list_news_redirections = GoogleByPass().bypass_google_urls(list_news)
return list_news_redirections
class FetcherDuckDuckGo(FetcherAbstract):
def __init__(self, search, search_category, period, lang="wt", region="wt"):
assert(search_category in ["news", "general"])
assert(period in ["d", "w", "m", "y"])
self.search = search
self.search_category = search_category
self.period = period
self.lang_region = "{}-{}".format(lang, region)
self.name = "duckduckgo {} {} {} {}".format(search, search_category, "1{}".format(period), region)
def _fetch(self):
try:
list_news = []
with DDGS(timeout=10) as ddgs:
if (self.search_category == "general"):
generator_links = ddgs.text(keywords=self.search, timelimit=self.period, region=self.lang_region)
elif (self.search_category == "news"):
generator_links = ddgs.news(keywords=self.search, timelimit=self.period, region=self.lang_region)
for l in generator_links:
list_news.append( l.get("url", l.get("href")) )
except Exception as e:
logger.warning("Exception fetching {}: {}".format(self.name, str(e)))
list_news = []
return list_news
class FetcherSearxNews(FetcherAbstract):
def __init__(self, search="child abuse", searx_instance="https://serx.ml/", lang="en", region="US", search_category="news", period="day"):
assert(search_category in ["news", "general"])
assert(period in [None, "day", "week", "month", "year"])
# Random header (minimize prob of web-scrapping detection)
self.headers = {
'User-agent': str(np.random.choice(user_agents_list)),
'Accept-Encoding': 'gzip, deflate',
'Accept': '*/*',
'Connection': 'keep-alive',
}
""" # Optional header
self.headers = {
'User-agent': str(np.random.choice(user_agents_list)),
'Accept-Encoding': 'gzip, deflate, br',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1',
'TE': 'trailers',
'Sec-Fetch-Site': 'cross-site',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Dest': 'document',
}
"""
self.search = search
self.searx_instance = searx_instance
self.lang_region = "{}-{}".format(lang, region)
self.search_category = search_category
self.period = period
self.t_sleep_lower, self.t_sleep_higher = 0.5, 1.5
self.request_timeout = 240
period_name_mapping = {
None: "no_date_range",
"day": "1d",
"week": "1w",
"month": "1m",
"year": "1y",
}
self.name = "searxng {} {} {} {} {}".format(searx_instance.replace("https://", "").replace("/", ""), search, search_category, period_name_mapping[period], self.lang_region)
logger.info("SearX - Initialized SearX fetcher: {}".format(self.name))
def _request_and_decode(self, url_search):
# Initial random time sleep (minimize chance of getting blocked)
time.sleep(random.uniform(self.t_sleep_lower, self.t_sleep_higher))
# Request
logger.debug("SearX - Searching: {}".format(url_search))
try:
r = requests.get(url_search, headers=self.headers, timeout=self.request_timeout)
except Exception as e:
logger.warning("SearX - Exception in request: {}".format(url_search), "\n", str(e))
return []
if (r.status_code == 200):
# Status code Ok
logger.debug("SearX - Status code: {}".format(r.status_code))
elif (r.status_code == 429):
# TooManyRequests, "Rate limit exceeded"
logger.warning("SearX {} - Too many requests while running: {}. Request output: {}".format(self.name, r.url, r.text))
return []
else:
# Any other status code is treated as a failed request
logger.warning("SearX {} - Status code: {}. Request output: {}".format(self.name, r.status_code, r.text))
return []
# Decode request
soup = BeautifulSoup(r.text, 'html.parser')
page_url_set = set()
# h3 links
for elem in soup.find_all('h3'):
# Get url
url = elem.find('a').get('href')
page_url_set.add(url)
return page_url_set
def _get_news_list(self):
############################################################
# Domain & search parameter
search_domain = os.path.join(self.searx_instance, "search?q=")
# Search keywords
search_formatted = self.search.replace(" ", "+").replace(":", "%3A")
# Period formatted
period_formatted = "&time_range={}".format(self.period) if self.period is not None else ""
# Search parameters
search_parameters = "&category_{}=on&language={}{}".format(self.search_category, self.lang_region, period_formatted)
# Combined url search
url_search_nopage = "{}{}{}".format(search_domain, search_formatted, search_parameters)
############################################################
# Request and decode on page=1
url_set = self._request_and_decode(url_search_nopage)
# No results?
if (len(url_set) == 0):
logger.warning("SearX {} - Empty results on search: {}".format(self.name, url_search_nopage))
return []
# Iterate pages
search_numpage = 2
while True:
# Combine url search with page number
url_search_with_page = "{}&pageno={}".format(url_search_nopage, search_numpage)
# Request and decode on page=X
url_set_i = self._request_and_decode(url_search_with_page)
# Length before merging
length_current = len(url_set)
# Merge
url_set = url_set.union(url_set_i)
# Length after merging
length_merged = len(url_set)
# No new elements?
if (length_current == length_merged):
logger.debug("SearX {} - Finished processing search, #pages: {}".format(self.name, search_numpage))
break
# Next page
search_numpage += 1
return list(url_set)
def _fetch(self):
try:
# Fetch news
list_news = self._get_news_list()
except Exception as e:
logger.warning("Exception fetching {}: {}".format(self.name, str(e)))
list_news = []
return list_news
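Since each concrete fetcher only supplies a name plus a _fetch() that returns a list of URLs, adding a new source is a matter of subclassing FetcherAbstract; a minimal hypothetical example:
```
class FetcherStaticList(FetcherAbstract):
    """Hypothetical fetcher returning a fixed URL list, useful for illustration/testing."""
    def __init__(self, urls):
        self.urls = urls
        self.name = "static list"

    def _fetch(self):
        # fetch_articles() logs the count and hands this list to db_writer.write_batch()
        return list(self.urls)
```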

View File

@@ -0,0 +1,26 @@
import requests
import json
from .logger import get_logger
logger = get_logger()
class GoogleByPass():
def __init__(self) -> None:
pass
def bypass_google_urls(self, list_urls):
if (len(list_urls) == 0):
return []
try:
# Endpoint
gbypass_endpoint = "http://selenium_app:80/get_redirection"
# Timeout: 20 minutes
timeout = 60*20
r = requests.post(gbypass_endpoint, json={"list_urls": list_urls}, timeout=timeout)
# Decode
list_urls_redirections = json.loads(r.text).get("list_urls_redirections", [])
except Exception as e:
logger.warning("Exception on request: {}. {}".format(gbypass_endpoint, str(e)))
list_urls_redirections = []
return list_urls_redirections

View File

@@ -0,0 +1,22 @@
import logging
import logging.handlers
import os
os.makedirs("logs", exist_ok=True)
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
logger = logging.getLogger("news_fetcher")
logger.setLevel(logging.INFO)
# To file log: INFO / WARNING / ERROR
fh = logging.handlers.RotatingFileHandler(filename="logs/log_app_fetcher.log", mode="a", maxBytes=10000000, backupCount=4)
fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
logger.addHandler(fh)
# To file log: WARNING / ERROR
fh_ = logging.handlers.RotatingFileHandler(filename="logs/log_app_fetcher_error.log", mode="a", maxBytes=10000000, backupCount=1)
fh_.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
fh_.setLevel(logging.WARNING)
logger.addHandler(fh_)
def get_logger():
return logger

View File

@@ -0,0 +1,36 @@
from .db_utils import DB_Handler
import requests
import json
from .logger import get_logger
logger = get_logger()
class MissingKidsFetch():
def __init__(self, db_handler: DB_Handler, num_pages) -> None:
logger.debug("Initializing News MissingKids")
self.db_handler = db_handler
self.num_pages = num_pages
self.missingkids_fetch_endpoint = "http://selenium_app:80/get_missing_kids/?pages={}"
def run(self):
try:
logger.debug("Starting NewsMissingKids.run()")
try:
# Timeout
if (self.num_pages > 15):
timeout = 60*90 # 1.5h
else:
timeout = 60*5 # 5 min
# Request
r = requests.get(self.missingkids_fetch_endpoint.format(self.num_pages), timeout=timeout)
# Decode
urls_fetched = json.loads(r.text).get("list_urls", [])
except Exception as e:
logger.warning("Timeout on request: {}. {}".format(missingkids_fetch_endpoint, str(e)))
urls_fetched = []
# URL fetching source
source = "missingkids fetcher"
# Write to DB
self.db_handler.write_batch(urls_fetched, source)
except Exception as e:
logger.warning("Exception in NewsMissingKids.run(): {}".format(str(e)))

View File

@@ -0,0 +1,98 @@
from .db_utils import URL_DB_Writer
from .url_utils import get_missing_kid_status
from .logger import get_logger
logger = get_logger()
def get_missing_kid_status(url, return_canonical_url=False):
import time
import requests
# Sleep
time.sleep(0.75)
try:
# Request
r = requests.get(url, timeout=300)
# Decode
status_code = r.status_code
# Canonical URL removing parameters
url_canonical = r.url
except Exception as e:
logger.warning("Exception on get URL status request: {}. {}".format(url, str(e)))
status_code = None
url_canonical = url
if (status_code == 200):
status = "valid"
elif (status_code == 404):
status = "invalid"
else:
status = "unknown"
logger.debug("Missing Kid URL {} status: {}".format(url, status))
if (return_canonical_url):
return status, url_canonical
else:
return status
class MissingKidsStatus():
def __init__(self, db_connect_info, redis_connect_info, num_urls) -> None:
self.num_urls = num_urls
self.db_connect_info = db_connect_info
self.redis_connect_info = redis_connect_info
self.db_writer = URL_DB_Writer(db_connect_info, redis_connect_info)
def update_missing_kids_status(self):
try:
logger.info("Starting updating status to Missing Kids URLs, limit #URLs: {}".format(self.num_urls))
# List of URLs
list_ids_and_urls = self.db_writer._get_missing_kids_urls(self.num_urls)
# Dict: status -> IDs to update to new status
dict_status_ids, dict_status_urls = {}, {}
# Check URLs with invalid status?
skip_invalid_check = False
flush_every, flush_current = 20, 0
# Iterate URLs
for (id, url, current_status) in list_ids_and_urls:
# Skip duplicate URLs
if (current_status == "duplicate"):
continue
# Skip invalid URLs?
if (skip_invalid_check):
if (current_status == "invalid"):
continue
# Get status
new_status = get_missing_kid_status(url)
# Different? Update
if (current_status != new_status):
# Extend array
dict_status_ids[new_status] = dict_status_ids.get(new_status, []) + [id]
# Debugging dict
dict_status_urls[new_status] = dict_status_urls.get(new_status, []) + [url]
# +1 processed
flush_current += 1
# Flush batch?
if (flush_every == flush_current):
logger.info("Updating status to Missing Kids URLs: {}".format(dict_status_urls))
# Update DB
self.db_writer._update_urls_status(dict_status_ids)
# Reset
flush_current = 0
dict_status_ids, dict_status_urls = {}, {}
# Flush remaining batch
if (flush_current > 0):
logger.info("Updating status to Missing Kids URLs: {}".format(dict_status_urls))
# Update DB
self.db_writer._update_urls_status(dict_status_ids)
# Reset
flush_current = 0
dict_status_ids, dict_status_urls = {}, {}
logger.info("Finished updating status to Missing Kids URLs")
except Exception as e:
logger.warning("Exception in MissingKidsStatus.run(): {}".format(str(e)))

View File

@@ -0,0 +1,62 @@
from .db_utils import URL_DB_Writer
from .url_utils import process_article
from .logger import get_logger
logger = get_logger()
class UpdateErrorURLs():
def __init__(self, db_connect_info, redis_connect_info, num_urls) -> None:
self.num_urls = num_urls
self.db_connect_info = db_connect_info
self.redis_connect_info = redis_connect_info
self.db_writer = URL_DB_Writer(db_connect_info, redis_connect_info)
def update_error_urls_status(self):
try:
logger.info("Starting updating status to URLs with error, limit #URLs: {}".format(self.num_urls))
# List of URLs with status 'error'
list_ids_and_urls = self.db_writer._get_error_urls(self.num_urls)
# Current status
current_status = "error"
# Dict: status -> IDs to update to new status
dict_status_ids, dict_status_urls = {}, {}
# Get list of (pattern, priority, status) tuples to override status if required
list_pattern_status_tuple = self.db_writer._get_pattern_status_list()
# Sort pattern tuples by priority
list_pattern_status_tuple.sort(key=lambda tup: tup[1], reverse=True)
flush_every, flush_current = 20, 0
# Iterate URLs
for (id, url) in list_ids_and_urls:
# Get status
url_canonical, article_elements, new_status = process_article(url, list_pattern_status_tuple)
# Different? Update
if (current_status != new_status):
# Extend array
dict_status_ids[new_status] = dict_status_ids.get(new_status, []) + [id]
# Debugging dict
dict_status_urls[new_status] = dict_status_urls.get(new_status, []) + [url]
# +1 processed
flush_current += 1
# Flush batch?
if (flush_every == flush_current):
logger.info("Updating status to URLs with error: {}".format(dict_status_urls))
# Update DB
self.db_writer._update_urls_status(dict_status_ids)
# Reset
flush_current = 0
dict_status_ids, dict_status_urls = {}, {}
# Flush remaining batch
if (flush_current > 0):
logger.info("Updating status to URLs with error: {}".format(dict_status_urls))
# Update DB
self.db_writer._update_urls_status(dict_status_ids)
# Reset
flush_current = 0
dict_status_ids, dict_status_urls = {}, {}
logger.info("Finished updating status to URLs with error")
except Exception as e:
logger.warning("Exception in UpdateErrorURLs.run(): {}".format(str(e)))

View File

@@ -0,0 +1,263 @@
from gnews import GNews
import dateutil.parser
from datetime import datetime, timedelta
from .utils import remove_http_s
import time
import random
import traceback
import requests
import json
import re
from bs4 import BeautifulSoup
from .logger import get_logger
logger = get_logger()
def get_published_date(article):
try:
"""
# Already fetched publish date information?
if (publish_date_ is not None):
return publish_date_
"""
# List of potential publish dates
potential_dates = []
# Publish date is the best match
potential_dates.append(article.publish_date)
# Publish date metadata is the following best match
potential_dates.append(article.meta_data.get('article', {}).get("published_time", None))
# Iterate remaining keys
for key in article.meta_data.keys():
if ("date" in key):
potential_dates.append(article.meta_data[key])
def invalid_date(p_date):
# Allow up to two days in the future (clock skew / timezone differences)
today_plus_two = datetime.utcnow() + timedelta(days=2)
# Anything later than that is treated as an invalid date
return p_date.timestamp() > today_plus_two.timestamp()
for date_ in potential_dates:
# String date? parse
if (type(date_) == str):
try:
date_ = dateutil.parser.parse(date_)
except Exception as e:
logger.info("Invalid date found while parsing potential date: {} for URL: {}".format(date_, article.url))
date_ = None
# Valid?
if (date_ is not None) and (not invalid_date(date_)):
return date_
logger.debug("Article with no published date: {}".format(article.url))
return None
except Exception as e:
logger.info("Error while retrieving published date for URL: {}".format(article.url))
return None
def get_url_host(article_source_url, url):
# https://www.blabla.com/blabla -> www.blabla.com
if (article_source_url != ""):
# Article source URL already extracted, save path if any
return remove_http_s(article_source_url) # .split("/")[0]
else:
return remove_http_s(url).split("/")[0]
def get_status_pattern_matching(url, article_status, list_pattern_status_tuple):
# Regex pattern to update status on "valid", "invalid", and "unknown" status only
# Status "raw", "duplicated" and "error" should remain the way they are
# Assumption: List of patterns sorted by importance
if (article_status in ["valid", "invalid", "unknown"]):
# Regular expression pattern matching: https://regexr.com/
for regex_pattern, regex_priority, status_if_match in list_pattern_status_tuple:
# Matching?
matching = bool(re.match(regex_pattern, url))
# Update article status
if (matching):
if (status_if_match != article_status):
logger.debug("Regex pattern found, updating status from '{}' to '{}' for URL: {}".format(article_status, status_if_match, url))
return status_if_match
# Pattern matching not required or not found, original article status
return article_status
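As an aside, a minimal sketch of the (pattern, priority, status) tuples this helper expects; the patterns, priorities and URLs below are made-up examples, not values from the database:
# Hypothetical (pattern, priority, status) tuples -- not real configuration
example_patterns = [
    (r"https?://(www\.)?missingkids\.org/poster/.*", 10, "valid"),
    (r"https?://[^/]+/tag/.*", 5, "invalid"),
]
# Higher priority first, as the callers assume
example_patterns.sort(key=lambda tup: tup[1], reverse=True)
# A tag page initially marked "valid" would be overridden:
# get_status_pattern_matching("https://example.com/tag/politics", "valid", example_patterns) -> "invalid"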
def bypass_google_link(article_url):
def bypass_google_consent(article_url):
# Sample URL: https://consent.google.com/m?continue=https://news.google.com/rss/articles/CBMiMGh0dHBzOi8vd3d3Lm1pc3NpbmdraWRzLm9yZy9wb3N0ZXIvbmNtYy84NjAxMTkvMdIBAA?oc%3D5&gl=NL&m=0&pc=n&cm=2&hl=en-US&src=1
article_url_no_consent = article_url.replace("https://consent.google.com/m?continue=", "")
# https://stackoverflow.com/questions/76063646/how-can-i-have-redirection-link-from-google-news-link-using-requests
headers = {
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
}
cookies = {'CONSENT': 'YES+cb.20220419-08-p0.cs+FX+111'}
try:
# Request
r = requests.get(article_url_no_consent, headers=headers, cookies=cookies, timeout=300)
# Decode
soup = BeautifulSoup(r.text, 'html.parser')
url_of_interest = soup.a['href']
except Exception as e:
logger.warning("Exception on request trying to G_bypass with headers: {}. {}".format(article_url_no_consent, str(e)))
url_of_interest = None
# Not able to bypass?
if (url_of_interest == "") or ("support.google.com" in url_of_interest) or ("news.google.com" in url_of_interest):
url_of_interest = None
return url_of_interest
def bypass_google_using_service(article_url):
try:
# e.g.: url = "https://news.google.com/articles/CBMiX2h0dHBzOi8vd3d3LmZveGJ1c2luZXNzLmNvbS9wb2xpdGljcy9kaXNuZXktc3Vlcy1mbG9yaWRhLWdvdi1yb24tZGVzYW50aXMtbG9zcy1zcGVjaWFsLWRpc3RyaWN00gEA?hl=en-US&gl=US&ceid=US%3Aen"
gbypass_endpoint = "http://selenium_app:80/get_redirection"
# Timeout: 5 minutes
r = requests.post(gbypass_endpoint, json={"url": article_url}, timeout=300)
# Decode
redirect_url = json.loads(r.text).get("redirect_url", "")
except Exception as e:
logger.warning("Exception on request: {}. {}".format(gbypass_endpoint, str(e)))
redirect_url = ""
return redirect_url
logger.debug("Starting gbypass_endpoint()")
article_url_bypassed = None
# Bypass using request
if ("consent.google.com" in article_url):
article_url_bypassed = bypass_google_consent(article_url)
# Not bypassed yet? Bypass using service
if (article_url_bypassed is None):
article_url_bypassed = bypass_google_using_service(article_url)
# if (article_url_bypassed is None) or (article_url_bypassed == "") or ("news.google.com" in article_url_bypassed):
if (article_url_bypassed == "") or (article_url_bypassed is None):
# Empty URL returned by Gbypass
logger.warning("Error while bypassing Gnews for URL: {}".format(article_url))
return None
else:
logger.debug("Correctly bypassed GNews to URL_redirect, from URL: {} {}".format(article_url_bypassed, article_url))
return article_url_bypassed
def process_article(article_url, list_pattern_status_tuple, language="en"):
# TODO:
"""
https://github.com/fhamborg/news-please
https://github.com/fhamborg/Giveme5W1H
https://github.com/santhoshse7en/news-fetch
"""
try:
logger.debug("Starting process_article()")
if ("news.google.com" in article_url) or ("consent.google.com" in article_url):
# Bypass to get redirection
article_url = bypass_google_link(article_url)
# Error?
if (article_url is None):
return None, {}, "error"
elif ("missingkids.org/poster" in article_url):
# Get status
article_status, url_canonical = get_missing_kid_status(article_url, return_canonical_url=True)
article_elements = {
"url_full": article_url,
"url_canonical": url_canonical
}
return url_canonical, article_elements, article_status
else:
# Avoid Too many requests (feeds, ...)
time.sleep(0.75)
logger.debug("Processing: {}".format(article_url))
# Default status unless something happens
article_status = "valid"
# Parse article
# TODO: :param proxy: The proxy parameter is a dictionary with a single key-value pair. self._proxy = {'http': proxy, 'https': proxy} if proxy else None
# TODO: Language per config
article = GNews(language).get_full_article(url=article_url)
# Article parsed?
if (article is None) or (not article.is_parsed):
logger.debug("Article not parsed: {}".format(article_url))
return article_url, {}, "error"
# Canonical link as main URL
url_canonical = article.canonical_link
# Empty canonical URL?
if (article.canonical_link is None) or (article.canonical_link == ""):
# URL with parameters? e.g. some zerohedge news fetched from newspaper3k end with #comment-stream -> Remove extra parameter in link
if ("?" in article.url) or (article.url.endswith("#comment-stream")) or (article.url.endswith("#disqus_thread")):
logger.debug("Article URL contains parameters, trying to clean URL: {}".format(article.url))
try:
# Remove text after parameter call
url = article.url.split("?")[0]
# Remove comment-stream
url = url.replace("#comment-stream", "").replace("#disqus_thread", "")
# Article
article_attempt = GNews(language).get_full_article(url=url)
# Retrieving same title? Update article based on clean URL
if (article_attempt is not None) and (article_attempt.title == article.title):
article = article_attempt
except Exception as e:
logger.info("Article parsing of URL without parameters failed: {}".format(article.url))
else: # Default behaviour
logger.debug("Article canonical link is empty, assuming URL=URL_CANONICAL: {}".format(article.url))
# By default, URL same as canonical
url_canonical = article.url
elif (article.url != article.canonical_link):
# If different, stick to canonical URL
logger.debug("Article URL and canonical link are different: {} {}".format(article.url, article.canonical_link))
else:
# If same, continue...
pass
# Update config to determine if content is valid
article.config.MIN_WORD_COUNT = 150
article.config.MIN_SENT_COUNT = 6
# Valid URL?
if (not article.is_valid_url()):
logger.debug("Not a valid news article: {}".format(url_canonical))
article_status = "invalid"
# Is the article's body text long enough to meet standard article requirements?
if (not article.is_valid_body()):
logger.debug("Article body not valid: {}".format(url_canonical))
article_status = "unknown"
if (article.images != article.imgs):
logger.debug("Article images and imgs are different: {} {}".format(article.images, article.imgs))
# article.keywords, article.meta_keywords, article.summary
# article.movies
# article.top_image
# Check if article status needs to be updated
article_status = get_status_pattern_matching(url_canonical, article_status, list_pattern_status_tuple)
article_elements = {
'url_full': article.url, # https://www.breitbart.com/tech/2022/10/03/report-election-integrity-project-worked-with-feds-to-censor-news-sites-in-2020/
'url_host': get_url_host(article.source_url, url_canonical), # www.breitbart.com
'title': article.title, # Report: Election Integrity Partnership Worked with Feds to Censor News Sites in 2020
'description': article.meta_description, # Coalition committed to respond in early 2022 but failed to do so, while Labor has not issued a full response since taking office
'text': article.text, # ${Article content}
'published_date': get_published_date(article), # python.datetime format, obtained from "YYYY-MM-DD" or '2022-10-03T20:54:17+00:00'
'authors': article.authors, # ['Christopher Knaus']
'language': article.meta_lang, # en
'tags': list(article.tags), # ['Wide Open Border', 'My Son Hunter Movie', ...]
'images': list(article.images), # [URL_IMAGE_1, URL_IMAGE_2, ...]
'url_canonical': url_canonical, # Canonical URL (redirection)
# 'html': article.html, # HTML article
}
logger.debug("Processing OK: {}".format(url_canonical))
return url_canonical, article_elements, article_status
except Exception as e:
logger.warning("Exception processing url: {}\n{}".format(article_url, traceback.format_exc()))
return None, {}, "error"

View File

@@ -0,0 +1,33 @@
def remove_http_s(url):
url = url.replace("https://", "") if url.startswith("https://") else url
url = url.replace("http://", "") if url.startswith("http://") else url
return url
def is_valid_url(url):
# Only https URLs are considered valid here
return url.startswith("https://")
def get_searxng_instances():
# SearxNG instances: https://searx.space/
searx_instances = set()
searx_instances.add("https://searx.work/")
searx_instances.add("https://search.ononoki.org/")
searx_instances.add("https://searxng.nicfab.eu/")
searx_instances.add("https://searx.be/")
# searx_instances.add("https://searx.fmac.xyz/")
# searx_instances.add("https://northboot.xyz/") # FIX
# searx_instances.add("https://serx.ml/") # Offline
# searx_instances.add("https://searx.ru/")
# searx_instances.add("https://searx.sp-codes.de/")
# searx_instances.add("https://searxng.nicfab.eu/")
# searx_instances.add("https://s.frlt.one/")
# searx_instances.add("https://search.sapti.me/")
# To list
list_searx_instances = list(searx_instances)
return list_searx_instances

View File

@@ -0,0 +1,190 @@
from ..models import Urls, UrlContent, UrlsSource, Source, WebsiteToFilter, StatusPatternMatching
from .url_processor import process_url
from django.utils import timezone
from django.core.cache import cache
import hashlib
from datetime import timedelta
import re
import traceback
from .logger import get_logger
logger = get_logger()
class DB_Handler():
def __init__(self):
logger.debug("Initializing URL DB Handler")
def _get_safe_cache_key(self, raw_key):
"""Generate a safe cache key using an MD5 hash"""
return hashlib.md5(raw_key.encode()).hexdigest()
def _cache_key(self, cache_key, cache_timeout=86400):
cache.set(self._get_safe_cache_key(cache_key), True, timeout=cache_timeout)
def _is_cached_key(self, cache_key):
# Returns True if cached
return cache.get(self._get_safe_cache_key(cache_key)) is not None
def insert_raw_urls(self, urls, source):
try:
logger.debug("Inserting raw URLs")
# Empty?
if (len(urls) == 0):
logger.debug("Empty batch of urls (not writing to DB) for source: {}".format(source))
return
url_object_to_insert = []
# Per URL
for url in urls:
### Already processed URL?
if (self._is_cached_key(url)):
logger.debug("Already cached URL: {}".format(url))
if (self._is_cached_key("{}{}".format(source, url))):
logger.debug("Already cached (source, URL): {} {}".format(source, url))
else:
### Insert source
# Get the source (create if not exists)
source_obj, created = Source.objects.get_or_create(source=source)
# Get URL ID
url_obj = Urls.objects.get(url=url)
# Create (id_source, id_url)
UrlsSource.objects.create(id_source=source_obj.id, id_url=url_obj.id)
else:
# Add object to insert
url_object_to_insert.append(Urls(url=url))
### Bulk insert URLs, ignore conflicts if a url exists
bulk_created_obj = Urls.objects.bulk_create(url_object_to_insert, ignore_conflicts=True)
# Insert or update cache
for url in urls:
self._cache_key(url)
self._cache_key("{}{}".format(source, url))
logger.info("Inserted #{} raw URLs".format(len(url_object_to_insert)))
except Exception as e:
logger.warning("Exception inserting raw URLs: {}\n{}".format(e, traceback.format_exc()))
def _get_status_pattern_matching(self, url, article_status, list_pattern_status_tuple):
# Sort pattern tuples by priority. (pattern, priority, status)
list_pattern_status_tuple.sort(key=lambda tup: tup[1], reverse=True)
# Regex pattern to update status on "valid", "invalid", and "unknown" status only
# Status "raw", "duplicated" and "error" should remain the way they are
# Assumption: List of patterns sorted by importance
if (article_status in ["valid", "invalid", "unknown"]):
# Regular expression pattern matching: https://regexr.com/
for regex_pattern, regex_priority, status_if_match in list_pattern_status_tuple:
# Matching? Update article status
if bool(re.match(regex_pattern, url)):
if (status_if_match != article_status):
logger.debug("Regex pattern found, updating status from '{}' to '{}' for URL: {}".format(article_status, status_if_match, url))
return status_if_match
# Pattern matching not required or not found, original article status
return article_status
def process_raw_urls(self, time_delta=timedelta(days=1), batch_size=50):
try:
logger.debug("Processing raw URLs")
# Get list of domains to filter
list_domains_to_filter = WebsiteToFilter.objects.values_list('url_host', flat=True)
# Get list of (pattern, priority, status) tuples to override status if required
list_pattern_status_tuple = list(StatusPatternMatching.objects.values_list("pattern", "priority", "status"))
# Fetched during last 24 hours
time_delta_ts = timezone.now() - time_delta
# Get batch of URLs, status='raw' and fetched X days ago
raw_urls = Urls.objects.filter(status='raw', ts_fetch__gte=time_delta_ts)[:batch_size]
# List of objects to bulk update
updating_urls = []
# Per URL
for obj_url in raw_urls:
##### Any domain to filter included in URL? -> Invalid
if (any([d in obj_url.url for d in list_domains_to_filter])):
logger.debug("Domain filter applied to input URL: {}".format(obj_url.url))
# Update status
obj_url.status = 'invalid'
# Append to bulk update
updating_urls.append(obj_url)
# Next URL
continue
##### Process URL
try:
# Get data
dict_url_data = process_url(obj_url.url)
# Not none or handle as exception
assert(dict_url_data is not None)
except Exception as e:
logger.debug("Error processing URL: {}\n{}".format(obj_url.url, str(e)))
# Update status
obj_url.status = 'error'
# Append to bulk update
updating_urls.append(obj_url)
# Next URL
continue
##### Canonical URL different? -> Duplicate
if (dict_url_data.get("url") != dict_url_data.get("url_canonical")):
# Update status
obj_url.status = 'duplicate'
# Append to bulk update
updating_urls.append(obj_url)
# Get or create URL with canonical form
obj_url_canonical, _ = Urls.objects.get_or_create(url=dict_url_data.get("url_canonical"))
# Associate same sources to url -> url_canonical
# Get the sources id associated to obj_url.id
url_sources = UrlsSource.objects.filter(id_url=obj_url.id)
for url_source_obj in url_sources:
# Associate same sources to url_canonical (it might already exist)
UrlsSource.objects.get_or_create(id_source=url_source_obj.id_source, id_url=obj_url_canonical.id)
# Next URL
continue
##### Valid URL
# Update status
obj_url.status = 'valid'
# Append to bulk update
updating_urls.append(obj_url)
# Create extracted URL data
UrlContent.objects.update_or_create(
id_url=obj_url.id,
defaults={
"date_published": dict_url_data.get("publish_date"),
"title": dict_url_data.get("title"),
"description": dict_url_data.get("description"),
"content": dict_url_data.get("content"),
"valid_content": dict_url_data.get("valid_content"),
"language": dict_url_data.get("language"),
"keywords": dict_url_data.get("keywords"),
"tags": dict_url_data.get("tags"),
"authors": dict_url_data.get("authors"),
"image_main": dict_url_data.get("image_main"),
"images_url": dict_url_data.get("images"), # process_url() returns these under "images"
"videos_url": dict_url_data.get("videos"), # ... and "videos"
"url_host": dict_url_data.get("url_host"),
"site_name": dict_url_data.get("site_name"),
},
)
##### Override status if pattern matching?
for obj_url in updating_urls:
# Check if article status needs to be updated with pattern matching
status_pattern_matching = self._get_status_pattern_matching(obj_url.url, obj_url.status, list_pattern_status_tuple)
# Update status?
if (status_pattern_matching != obj_url.status):
logger.debug("Pattern matching, overriding with status {} for URL: {}".format(status_pattern_matching, obj_url.url))
# Update, no need to append to updating_urls, already included
obj_url.status = status_pattern_matching
# Bulk update
Urls.objects.bulk_update(updating_urls, ['status'])
logger.debug("Finished processing raw URLs")
except Exception as e:
logger.warning("Exception processing raw URLs: {}\n{}".format(e, traceback.format_exc()))

View File

@@ -0,0 +1,50 @@
from .db_utils import DB_Handler
from ..models import Feed
import feedparser
import dateutil
import traceback
from .logger import get_logger
logger = get_logger()
class FetchFeeds():
def __init__(self) -> None:
logger.debug("Initializing News feed")
def run(self):
try:
logger.debug("Starting NewsFeed.run()")
# Get feeds
list_url_feeds = list(Feed.objects.values_list('rss_feed', flat=True))
logger.debug("Fetching news from feeds: {}".format(list_url_feeds))
# Process via RSS feeds
for url_feed in list_url_feeds:
# Initialize
urls_fetched, urls_publish_date = [], []
# Fetch feeds
feeds = feedparser.parse(url_feed)
# Parse
for f in feeds.get("entries", []):
# Get URL
url = f.get("link", None)
# Process?
if (url is not None):
# Available publish date?
publish_date_parsed = f.get("published_parsed")
if (publish_date_parsed is None):
publish_date = f.get("published", None)
if (publish_date is not None):
publish_date_parsed = dateutil.parser.parse(publish_date)
# Published date
urls_publish_date.append(publish_date_parsed)
# URL
urls_fetched.append(url)
# URL fetching source
source = "feed {}".format(url_feed)
# Write to DB
DB_Handler().insert_raw_urls(urls_fetched, source)
except Exception as e:
logger.warning("Exception in NewsFeed.run(): {}\n{}".format(e, traceback.format_exc()))

View File

@@ -0,0 +1,22 @@
import logging
import logging.handlers
import os
os.makedirs("logs", exist_ok=True)
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
logger = logging.getLogger("news_fetcher")
logger.setLevel(logging.DEBUG)
# To file log: INFO / WARNING / ERROR
fh = logging.handlers.RotatingFileHandler(filename="logs/log_app_fetcher.log", mode="a", maxBytes=10000000, backupCount=4)
fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
logger.addHandler(fh)
# To file log: WARNING / ERROR
fh_ = logging.handlers.RotatingFileHandler(filename="logs/log_app_fetcher_error.log", mode="a", maxBytes=10000000, backupCount=1)
fh_.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
fh_.setLevel(logging.WARNING)
logger.addHandler(fh_)
def get_logger():
return logger
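Typical usage of this module elsewhere in the package, as a sketch:
from .logger import get_logger

logger = get_logger()
logger.info("Goes to the console and logs/log_app_fetcher.log")
logger.warning("Also lands in logs/log_app_fetcher_error.log")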

View File

@@ -0,0 +1,60 @@
from .logger import get_logger
logger = get_logger()
import newspaper
# pip install langdetect
#import langdetect
#langdetect.DetectorFactory.seed = 0
def process_url(url):
try:
# Process
article = newspaper.article(url)
except newspaper.ArticleException as e:
logger.warning("ArticleException for input URL {}\n{}".format(url, str(e)))
return None
except Exception as e:
logger.warning("Exception for input URL {}\n{}".format(url, str(e)))
return None
dict_data = {
"url": url,
"url_canonical": article.canonical_link,
"url_host": article.source_url,
"site_name": article.meta_site_name,
"publish_date": article.publish_date,
"language": article.meta_lang, # langdetect.detect(article.text)
"title": article.title,
"description": article.meta_description,
"content": article.text,
"valid_content": article.is_valid_body(),
"keywords": [k for k in set(article.keywords + article.meta_keywords) if k!=""],
"tags": article.tags,
"authors": article.authors,
"image_main": article.top_image, # article.meta_img
"images": article.images,
"videos": article.videos,
}
'''
# TODO: If exists, add tags article.meta_data.get("classification-tags", "").split(",")
if (dict_data["tags"] is None):
dict_data["tags"] = []
for k in article.meta_data.keys():
if ("tags" in k):
dict_data["tags"] += article.meta_data[k].split(",")
'''
# Sanity check
for k in dict_data.keys():
if (isinstance(dict_data[k], (list, set, tuple))):
# Remove empty strings and normalize to list
dict_data[k] = [e for e in dict_data[k] if e != ""]
# NULL instead of empty list
if (len(dict_data[k]) == 0):
dict_data[k] = None
else:
# NULL instead of empty string
if (dict_data[k] == ""):
dict_data[k] = None
return dict_data
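A hedged example of calling process_url() directly; the URL is illustrative and None is returned on parsing failures:
data = process_url("https://www.example.com/news/some-article")  # placeholder URL
if data is not None:
    print(data["title"], data["publish_date"], data["valid_content"])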

View File

@@ -1,13 +1,65 @@
from django_rq import job
import time
from .src.fetch_feed import FetchFeeds
from .src.db_utils import DB_Handler
'''
from src.fetch_parser import FetchParser
from src.fetch_search import FetchSearcher
from src.missing_kids_fetch import MissingKidsFetch
from src.missing_kids_status import MissingKidsStatus
from src.url_status import UpdateErrorURLs
from src.db_utils import DB_Handler
from src.credentials import db_connect_info, redis_connect_info
# DB Handler
db_handler = DB_Handler(db_connect_info, redis_connect_info)
'''
import logging
logger = logging.getLogger(__name__)
@job
def task_1(message):
logger.info("Message: {}".format(message))
def background_task(process_type: str):
logger.info("Task triggered: {}".format(process_type))
try:
FetchFeeds().run()
# DB_Handler().process_raw_urls()
'''
if (process_type == "fetch_feeds"):
FetchFeeds(db_handler).run()
elif (process_type == "fetch_parser"):
FetchParser(db_handler).run()
elif (process_type == "search") or (process_type == "search_full"):
FetchSearcher(cred.db_connect_info, cred.redis_connect_info, full=True).run()
elif (process_type == "search_reduced"):
FetchSearcher(cred.db_connect_info, cred.redis_connect_info, full=False).run()
# Selenium based
elif (process_type == "fetch_missing_kids_reduced"):
MissingKidsFetch(db_handler, num_pages=4).run()
elif (process_type == "fetch_missing_kids_full"):
MissingKidsFetch(db_handler, num_pages=100000).run()
elif (process_type == "update_missing_kids_status_reduced"):
MissingKidsStatus(cred.db_connect_info, cred.redis_connect_info, num_urls=50).update_missing_kids_status()
elif (process_type == "update_missing_kids_status_full"):
MissingKidsStatus(cred.db_connect_info, cred.redis_connect_info, num_urls=None).update_missing_kids_status()
elif (process_type == "update_error_urls"):
UpdateErrorURLs(cred.db_connect_info, cred.redis_connect_info, num_urls=100).update_error_urls_status()
else:
logger.error("Task error, unknown type: {}".format(process_type))
return
'''
logger.info("Task completed: {}".format(process_type))
except Exception as e:
logger.error(e)

View File

@@ -2,5 +2,5 @@ from django.urls import path
from .views import trigger_task
urlpatterns = [
path('trigger_task/', trigger_task, name='trigger_task')
path('fetch', trigger_task, name='trigger_task')
]

View File

@@ -1,10 +1,10 @@
import django_rq
from django.http import JsonResponse
from .tasks import task_1
from .tasks import background_task
def trigger_task(request):
"""View that enqueues a task."""
queue = django_rq.get_queue('default') # Get the default queue
job = queue.enqueue(task_1, "Hello from Django RQ!")
job = queue.enqueue(background_task, "Hello from Django RQ!")
return JsonResponse({"message": "Task has been enqueued!", "job_id": job.id})
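Assuming the app's urls are mounted at the project root and the development server listens on localhost:8000 (both assumptions), a task can be triggered with a plain HTTP request:
import requests

r = requests.get("http://localhost:8000/fetch", timeout=30)
print(r.json())  # e.g. {"message": "Task has been enqueued!", "job_id": "..."}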

View File

@@ -91,6 +91,16 @@ DATABASES = {
}
}
CACHES = {
"default": {
"BACKEND": "django.core.cache.backends.redis.RedisCache",
"LOCATION": "redis://{}:{}".format(
os.environ.get("REDIS_HOST", "localhost"),
os.environ.get("REDIS_PORT", 6379)
),
}
}
RQ_QUEUES = {
'default': {
'HOST': os.environ.get("REDIS_HOST", "localhost"),