Url content

Author: Luciano Gervasoni
Date: 2025-03-07 00:34:46 +01:00
Parent: 4453a51d6d
Commit: 54ebd58070
66 changed files with 2072 additions and 21 deletions

3
.gitignore vendored Normal file
View File

@@ -0,0 +1,3 @@
__pycache__/
*.pyc
**/credentials.py

View File

@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
@@ -11,16 +11,40 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"db_postgres\n",
"\u001b[1A\u001b[1B\u001b[0G\u001b[?25l[+] Running 0/0\n",
" ⠿ Container db_postgres \u001b[39mStarting\u001b[0m \u001b[34m0.1s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l\u001b[34m[+] Running 1/1\u001b[0m\n",
" \u001b[32m✔\u001b[0m Container db_postgres \u001b[32mStarted\u001b[0m \u001b[34m0.2s \u001b[0m\n",
"\u001b[?25h"
]
}
],
"source": [
"!docker rm -f db_postgres; docker compose -f docker/docker-compose.yml up -d ; sleep 10"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"INSERT_TABLES = False\n",
"INSERT_SAMPLE_DATA = False\n",
"INSERT_TABLES = True\n",
"INSERT_SAMPLE_DATA = True\n",
"\n",
"import psycopg\n",
"connection_info = \"host={} port={} user={} password={} dbname={}\".format(\"localhost\", \"5432\", \"supermatitos\", \"supermatitos\", \"matitos\")\n",
"\n",
"from datetime import datetime, timezone\n",
"\n",
"\n",
"if INSERT_TABLES:\n",
" # Connect to an existing database\n",
@@ -87,14 +111,14 @@
" \n",
" \n",
" CREATE TABLE URL_CONTENT (\n",
" id_url INTEGER REFERENCES URLS(id),\n",
" date_published TIMESTAMPTZ NOT NULL DEFAULT NOW(),\n",
" id_url INTEGER PRIMARY KEY REFERENCES URLS(id),\n",
" date_published TIMESTAMPTZ DEFAULT NOW(),\n",
" title TEXT,\n",
" description TEXT,\n",
" content TEXT,\n",
" tags TEXT[],\n",
" authors TEXT[],\n",
" image_urls TEXT[],\n",
" image_urls TEXT[]\n",
" );\n",
" CREATE INDEX idx_tags ON URL_CONTENT USING GIN(tags);\n",
" CREATE INDEX idx_authors ON URL_CONTENT USING GIN(authors);\n",
@@ -119,7 +143,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
@@ -137,7 +161,6 @@
" cur.execute(\"INSERT INTO URLS (url, status) values ('https://www.dw.com/en/trauma-how-child-abuse-victims-deal-with-parenthood/a-71833895', 'valid')\")\n",
" cur.execute(\"INSERT INTO URLS (url, status) values ('https://nypost.com/2025/03/06/us-news/colorado-day-care-worker-hit-with-51-charges-of-child-abuse-harassment-for-slapping-toddler/', 'valid')\")\n",
" cur.execute(\"INSERT INTO URLS (url, status) values ('https://www.fox35orlando.com/news/tavares-police-florida-boys-10-9-abused-sheer-brutality', 'valid')\")\n",
" # Invalid\n",
" cur.execute(\"INSERT INTO URLS (url, status) values ('https://www.google.com', 'invalid')\")\n",
"\n",
" cur.execute(\"INSERT INTO SOURCE (source) values ('news.google.com')\")\n",
@@ -162,14 +185,157 @@
" \n",
" # Long URLs \n",
" cur.execute(\"INSERT INTO URLS (url, status) values ('www.super_url.org/superextrakmsdimsdf/349mvlsdfsdfwr/akivsdmimnsdifmisdf_23dj9sdgj9sdgj8sdf8ds8f.html', 'invalid')\".format(j))\n",
" cur.execute(\"INSERT INTO URLS (url, status) values ('www.super_url.org/superextrakmsdimsdf/349mvlsdfsdfwr/akivsdmimnsdifmisdf.html', 'invalid')\".format(j))"
" cur.execute(\"INSERT INTO URLS (url, status) values ('www.super_url.org/superextrakmsdimsdf/349mvlsdfsdfwr/akivsdmimnsdifmisdf.html', 'invalid')\".format(j))\n",
"\n",
" # URL Content\n",
" content = \"Bla Bla Bla!!!\"*25\n",
" cur.execute(\"INSERT INTO URL_CONTENT (id_url, date_published, title, description, content, tags, authors, image_urls) values (%s, %s, 'Mommy blogger turned child abuser', %s, 'Hello there!', %s, %s, %s)\", (1, datetime.now(tz=timezone.utc), content, [\"child abuse\", \"social media\"], [\"Audrey Conklin\"], [\"https://a57.foxnews.com/static.foxnews.com/foxnews.com/content/uploads/2023/08/1440/810/image-58.jpg?ve=1&tl=1\"]))"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 5,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\t urls\n",
"[(1,\n",
" 'https://www.foxnews.com/us/husband-ruby-franke-utah-mommy-blogger-convicted-child-abuse-regrets-wifes-fall-fame',\n",
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'valid'),\n",
" (2,\n",
" 'https://www.bbc.com/news/articles/ckg843y8y7no',\n",
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'valid'),\n",
" (3,\n",
" 'https://www.wilx.com/2025/03/05/lenawee-county-man-arrested-possessing-child-abuse-material/',\n",
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'valid'),\n",
" (4,\n",
" 'https://www.dw.com/en/trauma-how-child-abuse-victims-deal-with-parenthood/a-71833895',\n",
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'valid'),\n",
" (5,\n",
" 'https://nypost.com/2025/03/06/us-news/colorado-day-care-worker-hit-with-51-charges-of-child-abuse-harassment-for-slapping-toddler/',\n",
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'valid'),\n",
" (6,\n",
" 'https://www.fox35orlando.com/news/tavares-police-florida-boys-10-9-abused-sheer-brutality',\n",
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'valid'),\n",
" (7,\n",
" 'https://www.google.com',\n",
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (8,\n",
" 'www.super_0.org',\n",
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (9,\n",
" 'www.super_1.org',\n",
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (10,\n",
" 'www.super_2.org',\n",
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (11,\n",
" 'www.super_3.org',\n",
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (12,\n",
" 'www.super_4.org',\n",
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (13,\n",
" 'www.super_5.org',\n",
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (14,\n",
" 'www.super_6.org',\n",
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (15,\n",
" 'www.super_7.org',\n",
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (16,\n",
" 'www.super_8.org',\n",
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (17,\n",
" 'www.super_9.org',\n",
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (18,\n",
" 'www.super_10.org',\n",
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (19,\n",
" 'www.super_11.org',\n",
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (20,\n",
" 'www.super_12.org',\n",
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (21,\n",
" 'www.super_13.org',\n",
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (22,\n",
" 'www.super_14.org',\n",
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (23,\n",
" 'www.super_url.org/superextrakmsdimsdf/349mvlsdfsdfwr/akivsdmimnsdifmisdf_23dj9sdgj9sdgj8sdf8ds8f.html',\n",
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (24,\n",
" 'www.super_url.org/superextrakmsdimsdf/349mvlsdfsdfwr/akivsdmimnsdifmisdf.html',\n",
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid')]\n",
"\t urls_duplicate\n",
"[]\n",
"\t feed\n",
"[(1,\n",
" 'https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC')]\n",
"\t website_of_interest\n",
"[(1, 'www.unicef.org')]\n",
"\t search\n",
"[(1, 'child abuse')]\n",
"\t urls_source\n",
"[(1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (1, 2), (2, 2), (3, 2)]\n",
"\t source\n",
"[(1, 'news.google.com'), (2, 'qwant.com')]\n",
"\t website_to_filter\n",
"[(1, 'yewtu.be'),\n",
" (2, 'twitter.com'),\n",
" (3, 'libreddit.de'),\n",
" (4, 'youtube.com'),\n",
" (5, 'tiktok.com'),\n",
" (6, 'radio.foxnews.com')]\n",
"\t status_pattern_matching\n",
"[('.*missingkids.org/poster/.*', 50, 'valid')]\n",
"\t url_content\n",
"[(1,\n",
" datetime.datetime(2025, 3, 6, 23, 4, 37, 654130, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'Mommy blogger turned child abuser',\n",
" 'Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla '\n",
" 'Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla '\n",
" 'Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla '\n",
" 'Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla '\n",
" 'Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!',\n",
" 'Hello there!',\n",
" ['child abuse', 'social media'],\n",
" ['Audrey Conklin'],\n",
" ['https://a57.foxnews.com/static.foxnews.com/foxnews.com/content/uploads/2023/08/1440/810/image-58.jpg?ve=1&tl=1'])]\n"
]
}
],
"source": [
"from pprint import pprint\n",
"\n",
@@ -188,8 +354,22 @@
}
],
"metadata": {
"kernelspec": {
"display_name": "matitos",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python"
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.9"
}
},
"nbformat": 4,

View File

@@ -7,9 +7,12 @@ pip install ipykernel django requests ollama psycopg[binary] # openai
# Development
* app_web
```
# 1) Change models.py
python manage.py inspectdb
# 2)
python manage.py makemigrations
# 3)
@@ -23,7 +26,7 @@ python manage.py migrate --fake-initial
python manage.py createsuperuser
```
* app_img_gen
```
docker build -t image_generation .
docker run --rm -it -p 12343:80 image_generation

16
app_fetcher/Dockerfile Normal file
View File

@@ -0,0 +1,16 @@
FROM continuumio/miniconda3:23.10.0-1
# App repository
COPY . /opt/app/
RUN conda install -c conda-forge curl
RUN pip install --no-cache-dir --upgrade "psycopg[binary]" git+https://github.com/ranahaani/GNews.git GoogleNews duckduckgo_search newspaper3k numpy beautifulsoup4 requests feedparser pytz redis fastapi uvicorn fastapi-utils lxml[html_clean]
RUN pip freeze
# GoogleNews-1.6.10 Pillow-10.1.0 PyYAML-6.0.1 aiofiles-23.2.1 anyio-3.7.1 beautifulsoup4-4.9.3 bs4-0.0.1 click-8.1.7 cssselect-1.2.0 dateparser-1.2.0 dnspython-1.16.0 duckduckgo_search-3.9.8 fastapi-0.104.1 fastapi-utils-0.2.1 feedfinder2-0.0.4 feedparser-6.0.10 filelock-3.13.1 gnews-0.3.6 greenlet-3.0.1 h11-0.14.0 h2-4.1.0 hpack-4.0.0 httpcore-1.0.2 httpx-0.25.2 hyperframe-6.0.1 jieba3k-0.35.1 joblib-1.3.2 lxml-4.9.3 newspaper3k-0.2.8 nltk-3.8.1 numpy-1.26.2 psycopg-3.1.13 psycopg-binary-3.1.13 pydantic-1.10.13 pymongo-3.12.3 python-dateutil-2.8.2 python-dotenv-0.19.2 pytz-2023.3.post1 redis-5.0.1 regex-2023.10.3 requests-2.26.0 requests-file-1.5.1 sgmllib3k-1.0.0 six-1.16.0 sniffio-1.3.0 socksio-1.0.0 soupsieve-2.5 sqlalchemy-1.4.50 starlette-0.27.0 tinysegmenter-0.3 tldextract-5.1.1 typing-extensions-4.8.0 tzlocal-5.2 uvicorn-0.24.0.post1
WORKDIR /opt/app
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "80"]
# docker build -t fetch_app .
# docker run --rm --name container_fetch_app fetch_app

12
app_fetcher/README.md Normal file
View File

@@ -0,0 +1,12 @@
# Fetcher
* Fetcher app
- Exposes several endpoints, each of which triggers a specific type of fetching task
- For details, see the /{fetch_type} route in [app.py](app.py)
* Build and run
- Important: meant to be deployed together with the other micro-services via [docker-compose.yml](../docker-compose.yml)
```
docker build -t fetch_app .
docker run --rm --name container_fetch_app fetch_app
```
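
With the container running, each fetch type listed in [app.py](app.py) can be triggered over HTTP. A rough sketch, assuming the container's port 80 has been published as localhost:8000 (the docker run line above does not publish a port on its own):

```
# Hypothetical trigger of the /{fetch_type} endpoints; assumes the container's
# port 80 is published on localhost:8000 (adjust host/port to your deployment).
import requests

for fetch_type in ["feeds", "parser", "search_reduced"]:
    r = requests.get("http://localhost:8000/{}".format(fetch_type), timeout=10)
    print(fetch_type, r.json())  # e.g. {"message": "Started fetching feeds: Ok"}
```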

91
app_fetcher/app.py Normal file
View File

@@ -0,0 +1,91 @@
import src.credentials as cred
import logging
from logging.handlers import RotatingFileHandler
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
logger = logging.getLogger("news_fetcher")
logger.setLevel(logging.INFO)
import os
os.makedirs("logs", exist_ok=True)
# To file log
fh = RotatingFileHandler(filename="logs/log_app_fetcher.log", mode="a", maxBytes=10000000, backupCount=4)
fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
logger.addHandler(fh)
# To file log: WARNING / ERROR
fh_ = RotatingFileHandler(filename="logs/log_app_fetcher_error.log", mode="a", maxBytes=10000000, backupCount=1)
fh_.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
fh_.setLevel(logging.WARNING)
logger.addHandler(fh_)
logger.info("Environment: {}".format(cred.ENVIRONMENT))
##################################################################################################
from src.news_feed import NewsFeed
from src.news_parsing import NewsSiteParsing
from src.news_search import NewsSearch
from src.news_missing_kids import NewsMissingKids
from src.missing_kids_status import MissingKidsStatus
from src.url_status import UpdateErrorURLs
from src.fetcher_status import FetcherStatus
from fastapi import FastAPI, BackgroundTasks
# import requests
# from fastapi_utils.tasks import repeat_every
# import time
# time.sleep(10)
# import gc
app = FastAPI()
@app.get("/")
def hello_world():
return {"message": "OK"}
@app.get("/{fetch_type}")
async def fetch(background_tasks: BackgroundTasks, fetch_type: str):
# Concurrent job running
logger.info("Triggered fetch: {}".format(fetch_type))
if (fetch_type == "feeds"):
task_run = NewsFeed(cred.db_connect_info, cred.redis_connect_info).run
elif (fetch_type == "parser"):
task_run = NewsSiteParsing(cred.db_connect_info, cred.redis_connect_info).run
elif (fetch_type == "fetch_missing_kids_reduced"):
task_run = NewsMissingKids(cred.db_connect_info, cred.redis_connect_info, num_pages=4).run
elif (fetch_type == "fetch_missing_kids_full"):
task_run = NewsMissingKids(cred.db_connect_info, cred.redis_connect_info, num_pages=100000).run
elif (fetch_type == "search") or (fetch_type == "search_full"):
task_run = NewsSearch(cred.db_connect_info, cred.redis_connect_info, full=True).run
elif (fetch_type == "search_reduced"):
task_run = NewsSearch(cred.db_connect_info, cred.redis_connect_info, full=False).run
elif (fetch_type == "update_missing_kids_status_reduced"):
task_run = MissingKidsStatus(cred.db_connect_info, cred.redis_connect_info, num_urls=50).update_missing_kids_status
elif (fetch_type == "update_missing_kids_status_full"):
task_run = MissingKidsStatus(cred.db_connect_info, cred.redis_connect_info, num_urls=None).update_missing_kids_status
elif (fetch_type == "update_error_urls"):
task_run = UpdateErrorURLs(cred.db_connect_info, cred.redis_connect_info, num_urls=100).update_error_urls_status
elif (fetch_type == "fetch_warning_check"):
task_run = FetcherStatus(cred.db_connect_info, cred.redis_connect_info, last_minutes_check=180).check_warning
else:
return {"message": "ERROR. Unknown fetcher type!"}
# Run task
background_tasks.add_task(task_run)
# Return message
return {"message": "Started fetching {}: Ok".format(fetch_type)}
##################################################################################################
###########################
'''
@app.on_event("startup")
def verify_db() -> None:
logger.info("Testing DB connection")
import psycopg
with psycopg.connect(cred.db_connect_info) as conn:
url_test_msg = "Num URLs: {}".format(conn.execute("SELECT COUNT(*) FROM URLS;").fetchall())
logger.info(url_test_msg)
'''
###########################

456
app_fetcher/src/db_utils.py Normal file
View File

@@ -0,0 +1,456 @@
import psycopg
import redis
import traceback
import random
import requests
import json
import os
from .url_utils import process_article
import logging
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
logger = logging.getLogger("news_fetcher")
# TODO: URL_DB_HANDLER, _get_search_list, _get_url_host, _get_url_host_list, ...
# The rest, elsewhere
class URL_DB_Writer():
def __init__(self, db_connect_info, redis_connect_info):
logger.debug("Initializing URL DB writer")
self.db_connect_info = db_connect_info
self.redis_instance = redis.Redis(host=redis_connect_info.get("redis_host"), port=redis_connect_info.get("redis_port"))
self.redis_expiry_seconds = redis_connect_info.get("expiry_seconds", 172800) # Default: 48 hours
try:
self.redis_instance.ping()
logger.debug("Succesfully pinged Redis")
except Exception as e:
logger.warning("Error trying to ping Redis: {}".format(str(e)))
def get_urls_count(self, last_minutes_check):
#####################
### Get number of URLs within last X minutes
#####################
try:
# Update
with psycopg.connect(self.db_connect_info) as conn:
# Open cursor
cursor = conn.cursor()
num_urls = cursor.execute("SELECT COUNT(*) FROM URLS WHERE ts_fetch >= current_timestamp - interval '{} minutes';".format(last_minutes_check)).fetchone()[0]
except Exception as e:
logger.warning("Error updating URLs status: {}".format(str(e)))
num_urls = None
return num_urls
def _format(self, values):
# Replace single quote ' with ''. Based on https://stackoverflow.com/a/12320729
# String -> 'string', Int -> '1' (string-based), None -> NULL (no quotes for pgSQL to interpret Null value)
if (type(values) == list) or (type(values) == tuple):
insert_args = "(" + ", ".join([ "NULL" if v is None else "'" + str(v).replace("'", "''") + "'" for v in values]) + ")"
elif (type(values) == str):
insert_args = "({})".format( "NULL" if values is None else "'" + values.replace("'", "''") + "'" )
else:
logger.warning("Error formatting input values: {}".format(values))
assert False
return insert_args
def _get_cached_canonical_url(self, url):
### Redis: URL processed recently? -> Avoid increasing SERIAL counter & efficiency of DB
try:
filter_url = self.redis_instance.get(url)
if (filter_url is not None):
filter_url = filter_url.decode("utf-8")
except Exception as e:
logger.warning("Exception querying Redis: {}".format(str(e)))
filter_url = None
return filter_url
def _update_urls_status(self, dict_status_ids):
#####################
### Update status to array of URL IDs
#####################
try:
# Update
with psycopg.connect(self.db_connect_info) as conn:
# Open cursor
cursor = conn.cursor()
# Autocommit at end of transaction (Atomic insert of URLs and sources)
with conn.transaction() as tx:
for key_status, value_ids in dict_status_ids.items():
cursor.execute("UPDATE URLS SET status='{}' WHERE id IN ({});".format(key_status, ",".join([str(v) for v in value_ids])))
except Exception as e:
logger.warning("Error updating URLs status: {}".format(str(e)))
def _get_missing_kids_urls(self, num_urls=None):
#####################
### Get list of Missing Kids URLs
#####################
try:
missing_kids_ids_and_urls = []
if (num_urls is None):
limit = 500
else:
limit = num_urls
offset = 0
with psycopg.connect(self.db_connect_info) as conn:
# Open cursor
cursor = conn.cursor()
while True:
# Query
missing_kids_ids_and_urls_query = cursor.execute("SELECT id, url, status FROM URLS WHERE url LIKE '%missingkids.org/poster%' ORDER BY ts_fetch DESC LIMIT {} OFFSET {};".format(limit, offset)).fetchall()
# Finished?
if (len(missing_kids_ids_and_urls_query) == 0):
break
# Extend
missing_kids_ids_and_urls = missing_kids_ids_and_urls + missing_kids_ids_and_urls_query
# Offset
offset += len(missing_kids_ids_and_urls_query)
# Stop?
if (num_urls is not None) and (len(missing_kids_ids_and_urls) >= num_urls):
break
except Exception as e:
logger.warning("Error getting Missing Kids URLs: {}".format(str(e)))
missing_kids_ids_and_urls = []
return missing_kids_ids_and_urls
def _get_error_urls(self, num_urls=None):
#####################
### Get list of URLs with 'error' status
#####################
try:
error_urls = []
if (num_urls is None):
limit = 500
else:
limit = num_urls
offset = 0
with psycopg.connect(self.db_connect_info) as conn:
# Open cursor
cursor = conn.cursor()
while True:
# Query
error_urls_query = cursor.execute("SELECT id, url FROM URLS WHERE status='error' ORDER BY ts_fetch DESC LIMIT {} OFFSET {};".format(limit, offset)).fetchall()
# Finished?
if (len(error_urls_query) == 0):
break
# Extend
error_urls = error_urls + error_urls_query
# Offset
offset += len(error_urls_query)
# Stop?
if (num_urls is not None) and (len(error_urls) >= num_urls):
break
except Exception as e:
logger.warning("Error getting Error URLs: {}".format(str(e)))
error_urls = []
return error_urls
def _decode_urls(self, urls_fetched, list_domains_to_filter, list_pattern_status_tuple): # TODO: language for urls_fetched...
"""
# TODO: REFACTOR
For each input url
Already processed?
-> Update on Redis expire time
-> Associate to source
Not processed? Get main URL:
-> URL Canonical valid?
-> Rely on this as main URL
-> URL Canonical not valid?
-> Use input url, unless it's a news.google.com link
-> If news.google.com link, filter out. REDIS?
Main URL processing:
-> Update in REDIS, association url -> url_canonical
-> url != url_canonical: Add in duplicate table
If both != news.google.com
"""
# URLs to insert, URLs duplicated association, URL to Canonical form
list_insert_url_tuple_args, list_tuple_canonical_duplicate_urls, dict_full_urls_to_canonical = [], [], {}
# URL VS CANONICAL:
# News URL returned: https://news.google.com/articles/CBMifmh0dHBzOi8vd3d3LmJyZWl0YmFydC5jb20vMm5kLWFtZW5kbWVudC8yMDIzLzA0LzAzL2dvdi1kZXNhbnRpcy1zaWducy1iaWxsLW1ha2luZy1mbG9yaWRhLXRoZS0yNnRoLWNvbnN0aXR1dGlvbmFsLWNhcnJ5LXN0YXRlL9IBAA?hl=en-US&gl=US&ceid=US%3Aen
# Corresponds to canonical URL: https://www.breitbart.com/2nd-amendment/2023/04/03/gov-desantis-signs-bill-making-florida-the-26th-constitutional-carry-state/
for url in urls_fetched:
# Domain to filter? Input url
filter_due_to_domain = False
for domain_to_filter in list_domains_to_filter:
if (domain_to_filter in url):
logger.debug("Domain filter applied based on {} for input URL: {}".format(domain_to_filter, url))
filter_due_to_domain = True
if (filter_due_to_domain):
continue
# URL processed recently? -> Filter and avoid increasing SERIAL counter & efficiency of DB
cached_canonical_url = self._get_cached_canonical_url(url)
if (cached_canonical_url is not None):
# Even if url processed, need to add url_canonical to list_filtered_urls, so as to associate search source to canonical URL (canonical is the main URL entry)
dict_full_urls_to_canonical[url] = cached_canonical_url # X -> Y
# If url has been processed, so was its canonical form
logger.debug("Filtering out already inserted (processed) URL and its canonical form: {} {}".format(url, cached_canonical_url))
continue
# Process TODO: Add language...
url_canonical, article_elements, article_status = process_article(url, list_pattern_status_tuple)
# TODO: Store article_elements information to insert into OS after inserted into DB (and therefore having associated url_id)
# Could not retrieve redirection for news.google.com based URL? Continue (avoid inserting in DB)
if (url_canonical is None) and ("news.google.com" in url):
logger.debug("Filtering empty canonical link for base URL based on news.google.com: {}".format(url))
continue
# Canonical URL still news.google.com? Continue (avoid inserting in DB)
if (url_canonical is not None) and ("news.google.com" in url_canonical):
logger.debug("Filtering canonical news.google.com based URL: {}".format(url_canonical))
continue
# Domain to filter? Input canonical_url
filter_due_to_domain = False
for domain_to_filter in list_domains_to_filter:
if (url_canonical is not None) and (domain_to_filter in url_canonical):
filter_due_to_domain = True
if (filter_due_to_domain):
logger.info("Filtering due to domain input URL, Canonical_URL: {} {}".format(url, url_canonical))
continue
if (url_canonical is None) or (article_status == "error"):
logger.debug("Processing failed for URL: {}".format(url))
# Still insert URL with "error"? -> If processed later, might have inconsistent sources (url vs url_canonical). Only store if not news.google.com based
if ("news.google.com" in url) or ("consent.google.com" in url):
logging.debug("Not able to process Google News link, skipping: {}".format(url))
else:
dict_full_urls_to_canonical[url] = url # X -> X
list_insert_url_tuple_args.append( (url, article_status) )
continue
# URL was not processed (not sure canonical yet). Generate URL_CANONICAL <-> URL_ORIGINAL association if they're different
if (url_canonical != url):
list_tuple_canonical_duplicate_urls.append( (url_canonical, url) )
# Dict: url -> canonical (update association)
dict_full_urls_to_canonical[url] = url_canonical # X -> Y or X
# Canonical URL processed recently? -> Filter and avoid increasing SERIAL counter & efficiency of DB
if (self._get_cached_canonical_url(url_canonical) is not None):
# Canonical URL was already processed
logger.debug("Filtering out already inserted (processed) URL canonical: {}".format(url_canonical))
else:
# Insert url_canonical to DB formatted
list_insert_url_tuple_args.append( (url_canonical, article_status) )
# Canonical URL different? Process
if (url_canonical != url):
if ("news.google.com" in url) or ("consent.google.com" in url):
logging.debug("Not adding google.news.com based link, skipping: {}".format(url))
else:
# Fetched url -> duplicate (using canonical as main link)
article_status = "duplicate"
# Insert url (non-canonical) to DB formatted
list_insert_url_tuple_args.append( (url, article_status) )
return list_insert_url_tuple_args, list_tuple_canonical_duplicate_urls, dict_full_urls_to_canonical
def _insert_urls(self, cursor, list_insert_url_tuple_args):
#####################
### Insert URLs with status
#####################
if (len(list_insert_url_tuple_args) > 0):
insert_args = ', '.join( [ self._format(t) for t in list_insert_url_tuple_args] )
# Insert. (url_1, status_1), (url_2, status_2), ...
sql_code = "INSERT INTO URLS {} VALUES {} ON CONFLICT (url) DO NOTHING;".format("(url, status)", insert_args)
# logger.debug("SQL CODE: {}".format(sql_code))
c = cursor.execute(sql_code)
# NOTE: Not using "RETURNING id" since previously inserted URLs are not returned (ON CONFLICT)
# https://stackoverflow.com/questions/35949877/how-to-include-excluded-rows-in-returning-from-insert-on-conflict/35953488#35953488
def _insert_urls_duplicated(self, cursor, list_tuple_canonical_duplicate_urls):
#####################
### Insert duplicated URLs
#####################
if (len(list_tuple_canonical_duplicate_urls) > 0):
# Flatten, format, set to remove duplicates
args_duplicated_urls_set = "(" + ', '.join( set( [ "'" + str(y).replace("'", "''") + "'" for x in list_tuple_canonical_duplicate_urls for y in x] ) ) + ")"
# Dict: url -> id
dict_url_to_id = {}
# Get url -> id association to populate duplicated URLs
for (id_, url_) in cursor.execute("SELECT id, url FROM URLS WHERE url IN {};".format(args_duplicated_urls_set)).fetchall():
dict_url_to_id[url_] = id_
# Convert tuples (url_canonical, url) -> (id_url_canonical, id_url) to insert in DB
# ORIGINAL CODE. Issue, might not have found association to all urls
### list_tuple_canonical_duplicate_urls_ids = [ (dict_url_to_id[t[0]], dict_url_to_id[t[1]]) for t in list_tuple_canonical_duplicate_urls]
list_tuple_canonical_duplicate_urls_ids = []
for (url_1, url_2) in list_tuple_canonical_duplicate_urls:
id_url_1, id_url_2 = dict_url_to_id.get(url_1), dict_url_to_id.get(url_2)
if (id_url_1 is None) or (id_url_2 is None):
logger.debug("Skipping duplicate association due to no url -> id_url mapping available for tuple: {} {}".format(url_1, url_2))
else:
list_tuple_canonical_duplicate_urls_ids.append( (id_url_1, id_url_2) )
if (len(list_tuple_canonical_duplicate_urls_ids) > 0):
insert_args = ', '.join( [ self._format(t) for t in list_tuple_canonical_duplicate_urls_ids] )
# Insert. (id_url_canonical_1, id_url_1), ...
sql_code = "INSERT INTO URLS_DUPLICATE {} VALUES {} ON CONFLICT DO NOTHING;".format("(id_url_canonical, id_url_duplicated)", insert_args)
# logger.debug("SQL CODE: {}".format(sql_code))
c = cursor.execute(sql_code)
def _get_pattern_status_list(self):
#####################
### Get list of domains to filter
#####################
# TODO: Cache on redis and query once every N hours? ...
try:
with psycopg.connect(self.db_connect_info) as conn:
# Open cursor
cursor = conn.cursor()
# TODO: Cache on Redis
list_pattern_status = cursor.execute("SELECT pattern, priority, status FROM STATUS_PATTERN_MATCHING;").fetchall()
except Exception as e:
logger.warning("Error getting pattern status list: {}".format(str(e)))
list_pattern_status = []
return list_pattern_status
def _get_domains_to_filter(self):
#####################
### Get list of domains to filter
#####################
# TODO: Cache on redis and query once every N hours? ...
try:
with psycopg.connect(self.db_connect_info) as conn:
# Open cursor
cursor = conn.cursor()
# TODO: Cache on Redis
sites_to_filter = [e[0] for e in cursor.execute("SELECT url_host FROM WEBSITE_TO_FILTER;").fetchall() ]
except Exception as e:
logger.warning("Error getting domains to filter: {}".format(str(e)))
sites_to_filter = []
return sites_to_filter
def _get_cached_source_id(self, source):
### Redis: URL processed recently? -> Avoid increasing SERIAL counter & efficiency of DB
try:
source_id = self.redis_instance.get(source)
if (source_id is not None):
source_id = source_id.decode("utf-8")
except Exception as e:
logger.warning("Exception querying Redis: {}".format(str(e)))
source_id = None
return source_id
def _get_source_id(self, cursor, source):
#####################
### Get source corresponding id
#####################
# Cached?
id_source = self._get_cached_source_id(source)
if (id_source is None):
c = cursor.execute("SELECT id FROM SOURCE WHERE source='{}'".format(source.replace("'", "''"))).fetchone()
if (c is None) or (len(c) == 0):
# Source does not exist, insert and get id
c = cursor.execute("INSERT INTO SOURCE (source) VALUES ('{}') RETURNING id;".format(source.replace("'", "''"))).fetchone()
# Decode source id
id_source = c[0]
# Cache
self.redis_instance.set(source, id_source, ex=self.redis_expiry_seconds)
return id_source
def _get_urls_id(self, cursor, urls_full):
#####################
### Get id of inserted and filtered URLs
#####################
# TODO: Cache url -> url_id, url_canonical
if (len(urls_full) == 0):
return []
# Get inserted and filtered URL ids (unnested). Filtered URLs are also retrieved since they might have been fetched from a new source
in_inserted_filtered_urls = "(" + ', '.join(["'" + u.replace("'", "''") + "'" for u in urls_full]) + ")"
id_urls_related = [ i[0] for i in cursor.execute("SELECT id FROM URLS WHERE url IN {};".format(in_inserted_filtered_urls)).fetchall() ]
return id_urls_related
def _insert_urls_source(self, cursor, id_urls_related, id_source):
#####################
### Insert URL sources: (id_url_1, id_source), (id_url_2, id_source), ...
#####################
if (len(id_urls_related) == 0) or (id_source is None):
return
columns = "(id_url, id_source)"
insert_args = ', '.join( [ self._format([id_url, id_source]) for id_url in id_urls_related ] )
# Insert
sql_code = "INSERT INTO URLS_SOURCE {} VALUES {} ON CONFLICT DO NOTHING;".format(columns, insert_args)
# logger.debug("SQL CODE: {}".format(sql_code))
c = cursor.execute(sql_code)
def write_batch(self, urls_fetched, source):
# Chunks of 50 elements
n = 50
# Divide in small chunks
urls_fetched_chunks = [urls_fetched[i:i + n] for i in range(0, len(urls_fetched), n)]
# Process
for urls_fetched_chunk_i in urls_fetched_chunks:
self._write_small_batch(urls_fetched_chunk_i, source)
def _write_small_batch(self, urls_fetched, source):
try:
logger.info("Fetched #{} URLs, source: {}".format(len(urls_fetched), source))
if (len(urls_fetched) == 0):
logger.debug("Empty batch of urls (not writing to DB) for source: {}".format(source))
return
# Shuffle URLs to reduce continuous URLs of same URL host (minimize chance of being blocked for too many continuous requests)
random.shuffle(urls_fetched)
# Get list of domains to filter
list_domains_to_filter = self._get_domains_to_filter()
# Get list of (pattern, priority, status) tuples to override status if required
list_pattern_status_tuple = self._get_pattern_status_list()
# Sort pattern tuples by priority
list_pattern_status_tuple.sort(key=lambda tup: tup[1], reverse=True)
# Process URLs to update DB
list_insert_url_tuple_args, list_tuple_canonical_duplicate_urls, dict_full_urls_to_canonical = self._decode_urls(urls_fetched, list_domains_to_filter, list_pattern_status_tuple)
# Full set of URL and its canonical form (to associate them to a search), both to insert and filter
urls_full = set(dict_full_urls_to_canonical.keys()).union( set(dict_full_urls_to_canonical.values()) )
# Insert
with psycopg.connect(self.db_connect_info) as conn:
# Open cursor
cursor = conn.cursor()
# Autocommit at end of transaction (Atomic insert of URLs and sources)
with conn.transaction() as tx:
# Insert processed URLs
self._insert_urls(cursor, list_insert_url_tuple_args)
# Insert URLs duplicated (canonical != fetched url)
self._insert_urls_duplicated(cursor, list_tuple_canonical_duplicate_urls)
# Get source id in DB
id_source = self._get_source_id(cursor, source)
# Get IDs of all related URLs
id_urls_related = self._get_urls_id(cursor, urls_full)
# Insert search source associated to URLs
self._insert_urls_source(cursor, id_urls_related, id_source)
# Update Redis status of inserted and filtered URLs after writing to DB
for url, url_canonical in dict_full_urls_to_canonical.items():
try:
# Set with updated expiry time
self.redis_instance.set(url, url_canonical, ex=self.redis_expiry_seconds)
if (url != url_canonical):
self.redis_instance.set(url_canonical, url_canonical, ex=self.redis_expiry_seconds)
except Exception as e:
logger.warning("Exception running set in Redis: {}".format(str(e)))
if (len(list_insert_url_tuple_args) > 0):
try:
webhook_token = os.environ.get("CLIQ_WEBHOOK_TOKEN")
endpoint_message = "https://cliq.zoho.com/api/v2/channelsbyname/urlretrievalbot/message?zapikey={}".format(webhook_token)
payload = json.dumps({"text": "Fetched #{} new URLs, source: {}".format(len(list_insert_url_tuple_args), source) })
r = requests.post(endpoint_message, data=payload)
except Exception as e:
logger.warning("Webhook failed: {}".format(str(e)))
logger.debug("URL DB write finished")
except Exception as e:
logger.warning( "Exception writing to URL_DB:\n{}".format(traceback.format_exc()) )
logger.debug( "Exception --- List of URLs: {}".format(str(urls_fetched)) )

View File

@@ -0,0 +1,39 @@
from .db_utils import URL_DB_Writer
import json
import logging
import requests
import os
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
logger = logging.getLogger("news_fetcher")
class FetcherStatus():
def __init__(self, db_connect_info, redis_connect_info, last_minutes_check) -> None:
self.db_connect_info = db_connect_info
self.db_writer = URL_DB_Writer(db_connect_info, redis_connect_info)
self.last_minutes_check = last_minutes_check
def check_warning(self):
try:
logger.info("Starting fetcher check for last minutes {}".format(self.last_minutes_check))
# Get number of URLs
num_urls = self.db_writer.get_urls_count(last_minutes_check=self.last_minutes_check)
logger.debug("Fetched #URLs {} during the last {} minutes".format(num_urls, self.last_minutes_check))
webhook_token = os.environ.get("CLIQ_WEBHOOK_TOKEN")
endpoint_message = "https://cliq.zoho.com/api/v2/channelsbyname/urlfetchwarnings/message?zapikey={}".format(webhook_token)
if (num_urls is None):
try:
payload = json.dumps({"text": "[WARNING] Error on query to DB"})
r = requests.post(endpoint_message, data=payload)
except Exception as e:
logger.warning("Webhook failed: {}".format(str(e)))
elif (num_urls == 0):
try:
payload = json.dumps({"text": "[WARNING] No URLs fetched for {} minutes".format(self.last_minutes_check) })
r = requests.post(endpoint_message, data=payload)
except Exception as e:
logger.warning("Webhook failed: {}".format(str(e)))
except Exception as e:
logger.warning("Exception in UpdateErrorURLs.run(): {}".format(str(e)))

View File

@@ -0,0 +1,27 @@
import requests
import json
import logging
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
logger = logging.getLogger("news_fetcher")
class GoogleByPass():
def __init__(self) -> None:
pass
def bypass_google_urls(self, list_urls):
if (len(list_urls) == 0):
return []
try:
# Endpoint
gbypass_endpoint = "http://selenium_app:80/get_redirection"
# Timeout: 20 minutes
timeout = 60*20
r = requests.post(gbypass_endpoint, json={"list_urls": list_urls}, timeout=timeout)
# Decode
list_urls_redirections = json.loads(r.text).get("list_urls_redirections", [])
except Exception as e:
logger.warning("Exception on request: {}. {}".format(gbypass_endpoint, str(e)))
list_urls_redirections = []
return list_urls_redirections

View File

@@ -0,0 +1,69 @@
import requests
from .db_utils import URL_DB_Writer
from .url_utils import get_missing_kid_status
import time
import logging
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
logger = logging.getLogger("news_fetcher")
class MissingKidsStatus():
def __init__(self, db_connect_info, redis_connect_info, num_urls) -> None:
self.num_urls = num_urls
self.db_connect_info = db_connect_info
self.redis_connect_info = redis_connect_info
self.db_writer = URL_DB_Writer(db_connect_info, redis_connect_info)
def update_missing_kids_status(self):
try:
logger.info("Starting updating status to Missing Kids URLs, limit #URLs: {}".format(self.num_urls))
# List of URLs
list_ids_and_urls = self.db_writer._get_missing_kids_urls(self.num_urls)
# Dict: status -> IDs to update to new status
dict_status_ids, dict_status_urls = {}, {}
# Check URLs with invalid status?
skip_invalid_check = False
flush_every, flush_current = 20, 0
# Iterate URLs
for (id, url, current_status) in list_ids_and_urls:
# Skip duplicate URLs
if (current_status == "duplicate"):
continue
# Skip invalid URLs?
if (skip_invalid_check):
if (current_status == "invalid"):
continue
# Get status
new_status = get_missing_kid_status(url)
# Different? Update
if (current_status != new_status):
# Extend array
dict_status_ids[new_status] = dict_status_ids.get(new_status, []) + [id]
# Debugging dict
dict_status_urls[new_status] = dict_status_urls.get(new_status, []) + [url]
# +1 processed
flush_current += 1
# Flush batch?
if (flush_every == flush_current):
logger.info("Updating status to Missing Kids URLs: {}".format(dict_status_urls))
# Update DB
self.db_writer._update_urls_status(dict_status_ids)
# Reset
flush_current = 0
dict_status_ids, dict_status_urls = {}, {}
# Flush remaining batch
if (flush_current > 0):
logger.info("Updating status to Missing Kids URLs: {}".format(dict_status_urls))
# Update DB
self.db_writer._update_urls_status(dict_status_ids)
# Reset
flush_current = 0
dict_status_ids, dict_status_urls = {}, {}
logger.info("Finished updating status to Missing Kids URLs")
except Exception as e:
logger.warning("Exception in MissingKidsStatus.run(): {}".format(str(e)))

View File

@@ -0,0 +1,60 @@
from .db_utils import URL_DB_Writer
import feedparser
import dateutil.parser
import psycopg
import logging
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
logger = logging.getLogger("news_fetcher")
class NewsFeed():
def __init__(self, db_connect_info, redis_connect_info) -> None:
logger.debug("Initializing News feed")
self.db_connect_info = db_connect_info
self.redis_connect_info = redis_connect_info
def _get_feed_urls(self):
try:
with psycopg.connect(self.db_connect_info) as conn:
list_url_feeds = conn.execute("SELECT rss_feed FROM FEED;").fetchall()
# Decode (tuple with 1 element)
list_url_feeds = [l[0] for l in list_url_feeds]
except Exception as e:
logger.warning("Exception fetching RSS sites: " + str(e))
list_url_feeds = []
return list_url_feeds
def run(self):
try:
logger.debug("Starting NewsFeed.run()")
# Get feeds
list_url_feeds = self._get_feed_urls()
logger.debug("Fetching news from feeds: {}".format(str(list_url_feeds)))
# Process via RSS feeds
for url_feed in list_url_feeds:
# Initialize
urls_fetched, urls_publish_date = [], []
# Fetch feeds
feeds = feedparser.parse(url_feed)
# Parse
for f in feeds.get("entries", []):
# Get URL
url = f.get("link", None)
# Process?
if (url is not None):
# Available publish date?
publish_date = f.get("published", None)
if (publish_date is not None):
publish_date = dateutil.parser.parse(publish_date)
urls_publish_date.append(publish_date)
# URL
urls_fetched.append(url)
# URL fetching source
source = "feed {}".format(url_feed)
# Write to DB
db_writer = URL_DB_Writer(self.db_connect_info, self.redis_connect_info)
db_writer.write_batch(urls_fetched, source)
except Exception as e:
logger.warning("Exception in NewsFeed.run(): {}".format(str(e)))

View File

@@ -0,0 +1,40 @@
from .db_utils import URL_DB_Writer
import requests
import json
import logging
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
logger = logging.getLogger("news_fetcher")
class NewsMissingKids():
def __init__(self, db_connect_info, redis_connect_info, num_pages) -> None:
logger.debug("Initializing News MissingKids")
self.db_connect_info = db_connect_info
self.redis_connect_info = redis_connect_info
self.num_pages = num_pages
def run(self):
try:
logger.debug("Starting NewsMissingKids.run()")
try:
# Missing kids fetching endpoint, parameter number of pages to fetch
missingkids_fetch_endpoint = "http://selenium_app:80/get_missing_kids/?pages={}".format(self.num_pages)
# Timeout
if (self.num_pages > 15):
timeout = 60*90 # 1.5h
else:
timeout = 60*5 # 5 min
# Request
r = requests.get(missingkids_fetch_endpoint, timeout=timeout)
# Decode
urls_fetched = json.loads(r.text).get("list_urls", [])
except Exception as e:
logger.warning("Timeout on request: {}. {}".format(missingkids_fetch_endpoint, str(e)))
urls_fetched = []
# URL fetching source
source = "missingkids fetcher"
# Write to DB
db_writer = URL_DB_Writer(self.db_connect_info, self.redis_connect_info)
db_writer.write_batch(urls_fetched, source)
except Exception as e:
logger.warning("Exception in NewsMissingKids.run(): {}".format(str(e)))

View File

@@ -0,0 +1,58 @@
from .db_utils import URL_DB_Writer
import newspaper
import psycopg
import logging
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
logger = logging.getLogger("news_fetcher")
class NewsSiteParsing():
def __init__(self, db_connect_info, redis_connect_info) -> None:
logger.debug("Initializing News SiteParsing newspaper3k")
self.db_connect_info = db_connect_info
self.redis_connect_info = redis_connect_info
def _get_url_hosts(self):
try:
with psycopg.connect(self.db_connect_info) as conn:
list_url_hosts = conn.execute("SELECT url_host FROM WEBSITE_OF_INTEREST;").fetchall()
# Decode (tuple with 1 element)
list_url_hosts = [l[0] for l in list_url_hosts]
except Exception as e:
logger.warning("Exception fetching RSS sites: " + str(e))
list_url_hosts = []
return list_url_hosts
def _postprocess(self, article_urls):
return [url.replace("#comment-stream", "") for url in article_urls]
def run(self):
try:
logger.debug("Starting NewsSiteParsing.run() for {}")
# Get feeds
list_url_hosts = self._get_url_hosts()
logger.info("Fetching news by parsing URL hosts: {}".format(str(list_url_hosts)))
# Process newspaper3k build method
for url_host_feed in list_url_hosts:
# Protocol
if not (url_host_feed.startswith("http")):
url_host_feed_formatted = "https://" + url_host_feed
else:
url_host_feed_formatted = url_host_feed
logger.debug("Fetching newspaper3k parsing based on URL: {}".format(url_host_feed_formatted))
# Source object
url_host_built = newspaper.build(url_host_feed_formatted)
# Get articles URL list
urls_fetched = url_host_built.article_urls()
# Post-processing
urls_fetched = self._postprocess(urls_fetched)
# URL fetching source
source = "newspaper3k {}".format(url_host_feed)
# Write to DB
db_writer = URL_DB_Writer(self.db_connect_info, self.redis_connect_info)
db_writer.write_batch(urls_fetched, source)
except Exception as e:
logger.warning("Exception in NewsSiteParsing.run(): {}".format(str(e)))

View File

@@ -0,0 +1,181 @@
from .db_utils import URL_DB_Writer
import psycopg
from .utils import get_searxng_instances
from .search_sources import FetcherDuckDuckGo, FetcherGNews, FetcherGoogleNews, FetcherSearxNews, FetcherPreSearch
from threading import Thread
import time
import random
import logging
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
logger = logging.getLogger("news_fetcher")
class NewsSearch():
def __init__(self, db_connect_info, redis_connect_info, full=True) -> None:
logger.debug("Initializing News feed")
self.db_connect_info = db_connect_info
self.redis_connect_info = redis_connect_info
self.db_writer = URL_DB_Writer(db_connect_info, redis_connect_info)
self.full_search = full
def _get_url_host_list(self):
try:
with psycopg.connect(self.db_connect_info) as conn:
# List of URL host
list_url_host = [l[0] for l in conn.execute("SELECT url_host FROM WEBSITE_OF_INTEREST;").fetchall()]
# Clean http / https from URLs
list_url_host = [l.replace("https://", "").replace("http://", "") for l in list_url_host]
# Clean last slash if exists
list_url_host = [ l if not l.endswith("/") else l[:-1] for l in list_url_host]
except Exception as e:
logger.warning("Exception fetching URL host list: " + str(e))
list_url_host = []
return list_url_host
def _get_search_list(self):
try:
with psycopg.connect(self.db_connect_info) as conn:
# List of keyword searches
list_search_text = [l[0] for l in conn.execute("SELECT keyword_search FROM SEARCH;").fetchall()]
except Exception as e:
logger.warning("Exception fetching searches list: " + str(e))
list_search_text = []
return list_search_text
def _run_fetching(self, search_text):
logger.debug("Starting _run_fetching() for {}".format(search_text))
# Initialize DB Writer
db_writer = URL_DB_Writer(self.db_connect_info, self.redis_connect_info)
# Common parameters
lang, region = "en", "US"
### PreSearch
dict_params_news = {"search": search_text}
FetcherPreSearch(**dict_params_news).fetch_articles(db_writer)
### DuckDuckGo
period = "d"
dict_params_news = {"search": search_text, "lang": "wt", "region": "wt", "search_category": "news", "period": period}
FetcherDuckDuckGo(**dict_params_news).fetch_articles(db_writer)
dict_params_general = {"search": search_text, "lang": "wt", "region": "wt", "search_category": "general", "period": period}
FetcherDuckDuckGo(**dict_params_general).fetch_articles(db_writer)
if (self.full_search):
# Avoid site:{} search due to G-Bypass required time
if ("site:" not in search_text):
### GNews
dict_params = {"search": search_text, "lang": "wt", "region": "wt", "period": period}
FetcherGNews(**dict_params).fetch_articles(db_writer)
### GoogleNews
dict_params_news = {"search": search_text, "lang": lang, "region": region, "search_category": "news", "period": period}
FetcherGoogleNews(**dict_params_news).fetch_articles(db_writer)
# dict_params_general = {"search": search_text, "lang": lang, "region": region, "search_category": "general", "period": period}
'''
# Method run concurrently, minimize overlapping
time.sleep(random.uniform(1, 15))
list_threads = []
def run_search(FetcherObject, dict_params):
# Initialize DB Writer
db_writer = URL_DB_Writer(self.db_connect_info, self.redis_connect_info)
# Fetch and write to DB
FetcherObject(**dict_params).fetch_articles(db_writer)
"""
### SearxNG
period = "day"
for searx_instance in get_searxng_instances():
dict_params_news = {"search": search_text, "searx_instance": searx_instance, "lang": lang, "region": region, "search_category": "news", "period": period}
dict_params_general = {"search": search_text, "searx_instance": searx_instance, "lang": lang, "region": region, "search_category": "general", "period": period}
# Append thread
list_threads.append( Thread(target=run_search, args=(FetcherSearxNews, dict_params_news, )) )
list_threads.append( Thread(target=run_search, args=(FetcherSearxNews, dict_params_general, )) )
"""
### PreSearch
dict_params_news = {"search": search_text}
list_threads.append( Thread(target=run_search, args=(FetcherPreSearch, dict_params_news, )) )
### DuckDuckGo
period = "d"
dict_params_news = {"search": search_text, "lang": "wt", "region": "wt", "search_category": "news", "period": period}
dict_params_general = {"search": search_text, "lang": "wt", "region": "wt", "search_category": "general", "period": period}
# Append thread
list_threads.append( Thread(target=run_search, args=(FetcherDuckDuckGo, dict_params_news, )) )
list_threads.append( Thread(target=run_search, args=(FetcherDuckDuckGo, dict_params_general, )) )
if (self.full_search):
# Avoid site:{} search due to G-Bypass required time
if ("site:" not in search_text):
### GNews
for period in ["1d"]: # ["1d", "6h"]:
dict_params = {"search": search_text, "lang": "wt", "region": "wt", "period": period}
# Append thread
list_threads.append( Thread(target=run_search, args=(FetcherGNews, dict_params, )) )
### GoogleNews
for period in ["1d"]: # ["1d", "6h"]:
# News
dict_params_news = {"search": search_text, "lang": lang, "region": region, "search_category": "news", "period": period}
list_threads.append( Thread(target=run_search, args=(FetcherGoogleNews, dict_params_news, )) )
if False:
dict_params_general = {"search": search_text, "lang": lang, "region": region, "search_category": "general", "period": period}
list_threads.append( Thread(target=run_search, args=(FetcherGoogleNews, dict_params_general, )) )
# Run
MULTITHREADED = False
logger.debug("Fetching threads starting")
if MULTITHREADED:
for t in list_threads:
t.start()
# Join
for t in list_threads:
t.join()
else:
for t in list_threads:
t.start()
t.join()
logger.debug("Fetching threads finished")
'''
logger.debug("Finished _run_fetching()")
def run(self):
try:
logger.info("Fetching text searches & URL hosts of interest")
# Get text searches of interest
list_search_text_of_interest = self._get_search_list()
# Get URL host of interest
list_url_host = self._get_url_host_list()
# Get text searches for URL hosts
list_search_text_url_host = ["site:{}".format(l) for l in list_url_host]
MULTITHREADED = False
if MULTITHREADED:
# Run fetching
list_fetching_threads = []
for search_text in list_search_text_of_interest + list_search_text_url_host:
logger.debug("Fetching news for search: {}".format(search_text))
# Append thread
list_fetching_threads.append( Thread(target=self._run_fetching, args=(search_text, )) )
# Run
for t in list_fetching_threads:
t.start()
# Join
for t in list_fetching_threads:
t.join()
else:
for search_text in list_search_text_of_interest + list_search_text_url_host:
logger.debug("Fetching news for search: {}".format(search_text))
self._run_fetching(search_text)
logger.info("Finished fetching text searches & URL hosts of interest")
except Exception as e:
logger.warning("Exception in NewsSearch.run(): {}".format(str(e)))

View File

@@ -0,0 +1,321 @@
from duckduckgo_search import DDGS
from gnews import GNews
from GoogleNews import GoogleNews
import requests
from bs4 import BeautifulSoup
import os
import time
import json
import numpy as np
import random
from .user_agents import user_agents_list
from .google_bypass import GoogleByPass
from abc import ABC, abstractmethod
import logging
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
logger = logging.getLogger("news_fetcher")
# Generic fetcher (fetches articles, writes to DB)
class FetcherAbstract(ABC):
@abstractmethod
def _fetch(self):
pass
def fetch_articles(self, db_writer):
logger.debug("Starting fetch() for {}".format(self.name))
# Fetch articles
list_news = self._fetch()
logger.info("Found #{} articles for search: {}".format(len(list_news), self.name))
# Write to DB
db_writer.write_batch(list_news, self.name)
class FetcherPreSearch(FetcherAbstract):
def __init__(self, search):
"""
# period ->
- h = hours (eg: 12h)
- d = days (eg: 7d)
- m = months (eg: 6m)
- y = years (eg: 1y)
"""
self.search = search
self.period = "1d" # TODO Fixed for the moment
# self.lang = lang
# self.region = region
search_category = "news"
self.name = "presearch {} {} {}".format(search, search_category, self.period)
def _fetch(self):
try:
# PreSearch fetching endpoint, parameter search keyword
presearch_fetch_endpoint = "http://selenium_app:80/fetch_presearch/?search_keyword={}".format(self.search)
# Timeout: 15 minutes
r = requests.get(presearch_fetch_endpoint, timeout=900)
# Decode
list_news = json.loads(r.text).get("list_urls", [])
except Exception as e:
logger.warning("Timeout on request: {}. {}".format(presearch_fetch_endpoint, str(e)))
list_news = []
return list_news
class FetcherGNews(FetcherAbstract):
def __init__(self, search, period, lang="en", region="US"):
"""
# period ->
- h = hours (eg: 12h)
- d = days (eg: 7d)
- m = months (eg: 6m)
- y = years (eg: 1y)
"""
self.search = search
self.period = period
self.lang = lang
self.region = region
search_category = "news"
self.name = "gnews {} {} {} {}".format(search, search_category, period, "{}-{}".format(lang, region))
def _fetch(self):
try:
list_dict_news = GNews(self.lang, self.region, period=self.period).get_news(self.search)
# Decode
list_news = []
for l in list_dict_news:
list_news.append(l.get("url"))
except Exception as e:
logger.warning("Exception fetching {}: {}".format(self.name, str(e)))
list_news = []
# Bypass Google links
list_news_redirections = GoogleByPass().bypass_google_urls(list_news)
return list_news_redirections
class FetcherGoogleNews(FetcherAbstract):
def __init__(self, search, search_category="news", period="1d", lang="en", region="US"):
assert(search_category in ["news", "general"])
self.lang = lang
self.region = region
self.period = period
self.search_category = search_category
self.search = search
self.name = "googlenews {} {} {} {}".format(search, search_category, period, "{}-{}".format(lang, region))
def _fetch(self):
try:
# Initialize
g = GoogleNews(encode="utf-8", period=self.period, lang=self.lang, region=self.region)
g.enableException(True)
if (self.search_category == "general"):
set_links = set()
# Search
g.search(self.search)
# Iterate pages
MAX_ITER_PAGES = 15
for i in range(MAX_ITER_PAGES):
time.sleep(random.uniform(1, 1.5))
num_before = len(set_links)
# Get page
try:
links = g.page_at(i)
except Exception as e:
logger.warning("Exception fetching page in GoogleNews {}: {}".format(self.name, str(e)))
break
# Links
for l in links:
# '/url?esrc=s&q=&rct=j&sa=U&url=https://www.breitbart.com/news/scent-of-luxury-indias-jasmine-infuses-global-perfume/&ved=2ahUKEwjOybGSiN-AAxX1gv0HHfqSBpMQxfQBegQICBAC&usg=AOvVaw06GdoHyzPbIopUaEuUSQPQ'
url = l.get("link").split("url=")[-1]
set_links.add(url)
num_after = len(set_links)
# Finished?
if (num_before == num_after):
logger.debug("Iterated {} pages on GoogleNews general search".format(i))
break
# To list
list_news = list(set_links)
elif (self.search_category == "news"):
# Search
g.get_news(self.search)
# Fetch
list_news = g.get_links()
except Exception as e:
logger.warning("Exception fetching {}: {}".format(self.name, str(e)))
list_news = []
# Bypass Google links
list_news_redirections = GoogleByPass().bypass_google_urls(list_news)
return list_news_redirections
class FetcherDuckDuckGo(FetcherAbstract):
def __init__(self, search, search_category, period, lang="wt", region="wt"):
assert(search_category in ["news", "general"])
assert(period in ["d", "w", "m", "y"])
self.search = search
self.search_category = search_category
self.period = period
self.lang_region = "{}-{}".format(lang, region)
self.name = "duckduckgo {} {} {} {}".format(search, search_category, "1{}".format(period), region)
def _fetch(self):
try:
list_news = []
with DDGS(timeout=10) as ddgs:
if (self.search_category == "general"):
generator_links = ddgs.text(keywords=self.search, timelimit=self.period, region=self.lang_region)
elif (self.search_category == "news"):
generator_links = ddgs.news(keywords=self.search, timelimit=self.period, region=self.lang_region)
for l in generator_links:
list_news.append( l.get("url", l.get("href")) )
except Exception as e:
logger.warning("Exception fetching {}: {}".format(self.name, str(e)))
list_news = []
return list_news
class FetcherSearxNews(FetcherAbstract):
def __init__(self, search="child abuse", searx_instance="https://serx.ml/", lang="en", region="US", search_category="news", period="day"):
assert(search_category in ["news", "general"])
assert(period in [None, "day", "week", "month", "year"])
# Random header (minimize prob of web-scrapping detection)
self.headers = {
'User-agent': str(np.random.choice(user_agents_list)),
'Accept-Encoding': 'gzip, deflate',
'Accept': '*/*',
'Connection': 'keep-alive',
}
""" # Optional header
self.headers = {
'User-agent': str(np.random.choice(user_agents_list)),
'Accept-Encoding': 'gzip, deflate, br',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1',
'TE': 'trailers',
'Sec-Fetch-Site': 'cross-site',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Dest': 'document',
}
"""
self.search = search
self.searx_instance = searx_instance
self.lang_region = "{}-{}".format(lang, region)
self.search_category = search_category
self.period = period
self.t_sleep_lower, self.t_sleep_higher = 0.5, 1.5
self.request_timeout = 240
period_name_mapping = {
None: "no_date_range",
"day": "1d",
"week": "1w",
"month": "1m",
"year": "1y",
}
self.name = "searxng {} {} {} {} {}".format(searx_instance.replace("https://", "").replace("/", ""), search, search_category, period_name_mapping[period], self.lang_region)
logger.info("SearX - Initialized SearX fetcher: {}".format(self.name))
def _request_and_decode(self, url_search):
# Initial random time sleep (minimize chance of getting blocked)
time.sleep(random.uniform(self.t_sleep_lower, self.t_sleep_higher))
# Request
logger.debug("SearX - Searching: {}".format(url_search))
try:
r = requests.get(url_search, headers=self.headers, timeout=self.request_timeout)
except Exception as e:
logger.warning("SearX - Exception in request: {}".format(url_search), "\n", str(e))
return []
if (r.status_code == 200):
# Status code Ok
logger.debug("SearX - Status code: {}".format(r.status_code))
elif (r.status_code == 429):
# TooManyRequests, "Rate limit exceeded"
logger.warning("SearX {} - Too many requests while running: {}. Request output: {}".format(self.name, r.url, r.text))
return []
else:
# Any other status code is treated as a failed request
logger.warning("SearX {} - Status code: {}. Request output: {}".format(self.name, r.status_code, r.text))
return []
# Decode request
soup = BeautifulSoup(r.text, 'html.parser')
page_url_set = set()
# h3 links
for elem in soup.find_all('h3'):
# Get url (skip result headers without a link)
link = elem.find('a')
if (link is None):
continue
url = link.get('href')
page_url_set.add(url)
return page_url_set
def _get_news_list(self):
############################################################
# Domain & search parameter
search_domain = os.path.join(self.searx_instance, "search?q=")
# Search keywords
search_formatted = self.search.replace(" ", "+").replace(":", "%3A")
# Period formatted
period_formatted = "&time_range={}".format(self.period) if self.period is not None else ""
# Search parameters
search_parameters = "&category_{}=on&language={}{}".format(self.search_category, self.lang_region, period_formatted)
# Combined url search
url_search_nopage = "{}{}{}".format(search_domain, search_formatted, search_parameters)
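# Example of the assembled URL (assuming instance "https://searx.be/", search "child abuse",
# category "news", language "en-US" and period "day"):
#   https://searx.be/search?q=child+abuse&category_news=on&language=en-US&time_range=day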
############################################################
# Request and decode on page=1
url_set = self._request_and_decode(url_search_nopage)
# No results?
if (len(url_set) == 0):
logger.warning("SearX {} - Empty results on search: {}".format(self.name, url_search_nopage))
return []
# Iterate pages
search_numpage = 2
while True:
# Combine url search with page number
url_search_with_page = "{}&pageno={}".format(url_search_nopage, search_numpage)
# Request and decode on page=X
url_set_i = self._request_and_decode(url_search_with_page)
# Length before merging
length_current = len(url_set)
# Merge
url_set = url_set.union(url_set_i)
# Length after merging
length_merged = len(url_set)
# No new elements?
if (length_current == length_merged):
logger.debug("SearX {} - Finished processing search, #pages: {}".format(self.name, search_numpage))
break
# Next page
search_numpage += 1
return list(url_set)
def _fetch(self):
try:
# Fetch news
list_news = self._get_news_list()
except Exception as e:
logger.warning("Exception fetching {}: {}".format(self.name, str(e)))
list_news = []
return list_news
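# A minimal, hypothetical smoke test (requires network access; the SearxNG instance below is an example):
if __name__ == "__main__":
    searx_fetcher = FetcherSearxNews(
        search="child abuse",
        searx_instance="https://searx.be/",
        lang="en",
        region="US",
        search_category="news",
        period="day",
    )
    for fetched_url in searx_fetcher._fetch():
        print(fetched_url)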

View File

@@ -0,0 +1,63 @@
from .db_utils import URL_DB_Writer
from .url_utils import process_article
import logging
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
logger = logging.getLogger("news_fetcher")
class UpdateErrorURLs():
def __init__(self, db_connect_info, redis_connect_info, num_urls) -> None:
self.num_urls = num_urls
self.db_connect_info = db_connect_info
self.redis_connect_info = redis_connect_info
self.db_writer = URL_DB_Writer(db_connect_info, redis_connect_info)
def update_error_urls_status(self):
try:
logger.info("Starting updating status to URLs with error, limit #URLs: {}".format(self.num_urls))
# List of URLs with status 'error'
list_ids_and_urls = self.db_writer._get_error_urls(self.num_urls)
# Current status
current_status = "error"
# Dict: status -> IDs to update to new status
dict_status_ids, dict_status_urls = {}, {}
# Get list of (pattern, priority, status) tuples to override status if required
list_pattern_status_tuple = self.db_writer._get_pattern_status_list()
# Sort pattern tuples by priority
list_pattern_status_tuple.sort(key=lambda tup: tup[1], reverse=True)
flush_every, flush_current = 20, 0
# Iterate URLs
for (id, url) in list_ids_and_urls:
# Get status
url_canonical, article_elements, new_status = process_article(url, list_pattern_status_tuple)
# Different? Update
if (current_status != new_status):
# Extend array
dict_status_ids[new_status] = dict_status_ids.get(new_status, []) + [id]
# Debugging dict
dict_status_urls[new_status] = dict_status_urls.get(new_status, []) + [url]
# +1 processed
flush_current += 1
# Flush batch?
if (flush_every == flush_current):
logger.info("Updating status to URLs with error: {}".format(dict_status_urls))
# Update DB
self.db_writer._update_urls_status(dict_status_ids)
# Reset
flush_current = 0
dict_status_ids, dict_status_urls = {}, {}
# Flush remaining batch
if (flush_current > 0):
logger.info("Updating status to URLs with error: {}".format(dict_status_urls))
# Update DB
self.db_writer._update_urls_status(dict_status_ids)
# Reset
flush_current = 0
dict_status_ids, dict_status_urls = {}, {}
logger.info("Finished updating status to URLs with error")
except Exception as e:
logger.warning("Exception in UpdateErrorURLs.run(): {}".format(str(e)))

View File

@@ -0,0 +1,289 @@
from gnews import GNews
import dateutil.parser
from datetime import datetime, timedelta, timezone
from .utils import remove_http_s
import time
import random
import traceback
import requests
import json
import re
from bs4 import BeautifulSoup
import logging
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
logger = logging.getLogger("news_fetcher")
def get_published_date(article):
try:
"""
# Already fetched publish date information?
if (publish_date_ is not None):
return publish_date_
"""
# List of potential publish dates
potential_dates = []
# Publish date is the best match
potential_dates.append(article.publish_date)
# Publish date metadata is the following best match
potential_dates.append(article.meta_data.get('article', {}).get("published_time", None))
# Iterate remaining keys
for key in article.meta_data.keys():
if ("date" in key):
potential_dates.append(article.meta_data[key])
def invalid_date(p_date):
# Articles dated more than 2 days in the future are considered invalid
today_plus_two = datetime.now(timezone.utc) + timedelta(days=2)
return p_date.timestamp() > today_plus_two.timestamp()
for date_ in potential_dates:
# String date? parse
if (type(date_) == str):
try:
date_ = dateutil.parser.parse(date_)
except Exception as e:
logger.info("Invalid date found while parsing potential date: {} for URL: {}".format(date_, article.url))
date_ = None
# Valid?
if (date_ is not None) and (not invalid_date(date_)):
return date_
logger.debug("Article with no published date: {}".format(article.url))
return None
except Exception as e:
logger.info("Error while retrieving published date for URL: {}".format(article.url))
return None
def get_url_host(article_source_url, url):
# https://www.blabla.com/blabla -> www.blabla.com
if (article_source_url != ""):
# Article source URL already extracted, save path if any
return remove_http_s(article_source_url) # .split("/")[0]
else:
return remove_http_s(url).split("/")[0]
def get_status_pattern_matching(url, article_status, list_pattern_status_tuple):
# Regex pattern to update status on "valid", "invalid", and "unknown" status only
# Status "raw", "duplicated" and "error" should remain the way they are
# Assumption: List of patterns sorted by importance
if (article_status in ["valid", "invalid", "unknown"]):
# Regular expression pattern matching: https://regexr.com/
for regex_pattern, regex_priority, status_if_match in list_pattern_status_tuple:
# Matching?
matching = bool(re.match(regex_pattern, url))
# Update article status
if (matching):
if (status_if_match != article_status):
logger.debug("Regex pattern found, updating status from '{}' to '{}' for URL: {}".format(article_status, status_if_match, url))
return status_if_match
# Pattern matching not required or not found, original article status
return article_status
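# Hypothetical shape of list_pattern_status_tuple, as consumed above
# (patterns and priorities are illustrative, not taken from this repository):
#   [(r".*youtube\.com/.*", 10, "invalid"),
#    (r".*missingkids\.org/poster.*", 5, "valid")]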
def get_missing_kid_status(url, return_canonical_url=False):
# Sleep
time.sleep(0.75)
try:
# Request
r = requests.get(url, timeout=300)
# Decode
status_code = r.status_code
# Canonical URL removing parameters
url_canonical = r.url
except Exception as e:
logger.warning("Exception on get URL status request: {}. {}".format(url, str(e)))
status_code = None
url_canonical = url
if (status_code == 200):
status = "valid"
elif (status_code == 404):
status = "invalid"
else:
status = "unknown"
logger.debug("Missing Kid URL {} status: {}".format(url, status))
if (return_canonical_url):
return status, url_canonical
else:
return status
def bypass_google_link(article_url):
def bypass_google_consent(article_url):
# Sample URL: https://consent.google.com/m?continue=https://news.google.com/rss/articles/CBMiMGh0dHBzOi8vd3d3Lm1pc3NpbmdraWRzLm9yZy9wb3N0ZXIvbmNtYy84NjAxMTkvMdIBAA?oc%3D5&gl=NL&m=0&pc=n&cm=2&hl=en-US&src=1
article_url_no_consent = article_url.replace("https://consent.google.com/m?continue=", "")
# https://stackoverflow.com/questions/76063646/how-can-i-have-redirection-link-from-google-news-link-using-requests
headers = {
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
}
cookies = {'CONSENT': 'YES+cb.20220419-08-p0.cs+FX+111'}
try:
# Request
r = requests.get(article_url_no_consent, headers=headers, cookies=cookies, timeout=300)
# Decode
soup = BeautifulSoup(r.text, 'html.parser')
url_of_interest = soup.a['href']
except Exception as e:
logger.warning("Exception on request trying to G_bypass with headers: {}. {}".format(article_url_no_consent, str(e)))
url_of_interest = None
# Not able to bypass?
if (url_of_interest == "") or ("support.google.com" in url_of_interest) or ("news.google.com" in url_of_interest):
url_of_interest = None
return url_of_interest
def bypass_google_using_service(article_url):
try:
# e.g.: url = "https://news.google.com/articles/CBMiX2h0dHBzOi8vd3d3LmZveGJ1c2luZXNzLmNvbS9wb2xpdGljcy9kaXNuZXktc3Vlcy1mbG9yaWRhLWdvdi1yb24tZGVzYW50aXMtbG9zcy1zcGVjaWFsLWRpc3RyaWN00gEA?hl=en-US&gl=US&ceid=US%3Aen"
gbypass_endpoint = "http://selenium_app:80/get_redirection"
# Timeout: 5 minutes
r = requests.post(gbypass_endpoint, json={"url": article_url}, timeout=300)
# Decode
redirect_url = json.loads(r.text).get("redirect_url", "")
except Exception as e:
logger.warning("Exception on request: {}. {}".format(gbypass_endpoint, str(e)))
redirect_url = ""
return redirect_url
logger.debug("Starting gbypass_endpoint()")
article_url_bypassed = None
# Bypass using request
if ("consent.google.com" in article_url):
article_url_bypassed = bypass_google_consent(article_url)
# Not bypassed yet? Bypass using service
if (article_url_bypassed is None):
article_url_bypassed = bypass_google_using_service(article_url)
# if (article_url_bypassed is None) or (article_url_bypassed == "") or ("news.google.com" in article_url_bypassed):
if (article_url_bypassed == "") or (article_url_bypassed is None):
# Empty URL returned by Gbypass
logger.warning("Error while bypassing Gnews for URL: {}".format(article_url))
return None
else:
logger.debug("Correctly bypassed GNews to URL_redirect, from URL: {} {}".format(article_url_bypassed, article_url))
return article_url_bypassed
def process_article(article_url, list_pattern_status_tuple, language="en"):
# TODO:
"""
https://github.com/fhamborg/news-please
https://github.com/fhamborg/Giveme5W1H
https://github.com/santhoshse7en/news-fetch
"""
try:
logger.debug("Starting process_article()")
if ("news.google.com" in article_url) or ("consent.google.com" in article_url):
# Bypass to get redirection
article_url = bypass_google_link(article_url)
# Error?
if (article_url is None):
return None, {}, "error"
elif ("missingkids.org/poster" in article_url):
# Get status
article_status, url_canonical = get_missing_kid_status(article_url, return_canonical_url=True)
article_elements = {
"url_full": article_url,
"url_canonical": url_canonical
}
return url_canonical, article_elements, article_status
else:
# Avoid Too many requests (feeds, ...)
time.sleep(0.75)
logger.debug("Processing: {}".format(article_url))
# Default status unless something happens
article_status = "valid"
# Parse article
# TODO: :param proxy: The proxy parameter is a dictionary with a single key-value pair. self._proxy = {'http': proxy, 'https': proxy} if proxy else None
# TODO: Language per config
article = GNews(language).get_full_article(url=article_url)
# Article parsed?
if (article is None) or (not article.is_parsed):
logger.debug("Article not parsed: {}".format(article_url))
return article_url, {}, "error"
# Canonical link as main URL
url_canonical = article.canonical_link
# Empty canonical URL?
if (article.canonical_link is None) or (article.canonical_link == ""):
# URL with parameters? e.g. some zerohedge news fetched from newspaper3k end with #comment-stream -> Remove extra parameter in link
if ("?" in article.url) or (article.url.endswith("#comment-stream")) or (article.url.endswith("#disqus_thread")):
logger.debug("Article URL contains parameters, trying to clean URL: {}".format(article.url))
try:
# Remove text after parameter call
url = article.url.split("?")[0]
# Remove comment-stream
url = url.replace("#comment-stream", "").replace("#disqus_thread", "")
# Article
article_attempt = GNews(language).get_full_article(url=url)
# Retrieving same title? Update article based on clean URL
if (article_attempt is not None) and (article_attempt.title == article.title):
article = article_attempt
except Exception as e:
logger.info("Article parsing of URL without parameters failed: {}".format(article.url))
else: # Default behaviour
logger.debug("Article canonical link is empty, assuming URL=URL_CANONICAL: {}".format(article.url))
# By default, URL same as canonical
url_canonical = article.url
elif (article.url != article.canonical_link):
# If different, stick to canonical URL
logger.debug("Article URL and canonical link are different: {} {}".format(article.url, article.canonical_link))
else:
# If same, continue...
pass
# Update config to determine if content is valid
article.config.MIN_WORD_COUNT = 150
article.config.MIN_SENT_COUNT = 6
# Valid URL?
if (not article.is_valid_url()):
logger.debug("Not a valid news article: {}".format(url_canonical))
article_status = "invalid"
# Is the article's body text long enough to meet standard article requirements?
if (not article.is_valid_body()):
logger.debug("Article body not valid: {}".format(url_canonical))
article_status = "unknown"
if (article.images != article.imgs):
logger.debug("Article images and imgs are different: {} {}".format(article.images, article.imgs))
# article.keywords, article.meta_keywords, article.summary
# article.movies
# article.top_image
# Check if article status needs to be updated
article_status = get_status_pattern_matching(url_canonical, article_status, list_pattern_status_tuple)
article_elements = {
'url_full': article.url, # https://www.breitbart.com/tech/2022/10/03/report-election-integrity-project-worked-with-feds-to-censor-news-sites-in-2020/
'url_host': get_url_host(article.source_url, url_canonical), # www.breitbart.com
'title': article.title, # Report: Election Integrity Partnership Worked with Feds to Censor News Sites in 2020
'description': article.meta_description, # Coalition committed to respond in early 2022 but failed to do so, while Labor has not issued a full response since taking office
'text': article.text, # ${Article content}
'published_date': get_published_date(article), # python.datetime format, obtained from "YYYY-MM-DD" or '2022-10-03T20:54:17+00:00'
'authors': article.authors, # ['Christopher Knaus']
'language': article.meta_lang, # en
'tags': list(article.tags), # ['Wide Open Border', 'My Son Hunter Movie', ...]
'images': list(article.images), # [URL_IMAGE_1, URL_IMAGE_2, ...]
'url_canonical': url_canonical, # Canonical URL (redirection)
# 'html': article.html, # HTML article
}
logger.debug("Processing OK: {}".format(url_canonical))
return url_canonical, article_elements, article_status
except Exception as e:
logger.warning("Exception processing url: {}\n{}".format(article_url, traceback.format_exc()))
return None, {}, "error"

View File

@@ -0,0 +1,64 @@
# https://techblog.willshouse.com/2012/01/03/most-common-user-agents/
user_agents_list = [
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/111.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.3 Safari/605.1.15",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/111.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; rv:111.0) Gecko/20100101 Firefox/111.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/111.0",
"Mozilla/5.0 (X11; Linux x86_64; rv:102.0) Gecko/20100101 Firefox/102.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/112.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.44",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.54",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 OPR/96.0.0.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 OPR/97.0.0.0",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; rv:102.0) Gecko/20100101 Firefox/102.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.48",
"Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/110.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.34",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/110.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.39",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.2 Safari/605.1.15",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; rv:112.0) Gecko/20100101 Firefox/112.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.51",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/109.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/112.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/112.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/110.0",
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/110.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0",
"Mozilla/5.0 (X11; CrOS x86_64 14541.0.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 YaBrowser/23.3.0.2246 Yowser/2.5 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64; rv:108.0) Gecko/20100101 Firefox/108.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.6.1 Safari/605.1.15",
"Mozilla/5.0 (Windows NT 6.1; rv:102.0) Gecko/20100101 Goanna/6.0 Firefox/102.0 PaleMoon/32.0.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.41",
"Mozilla/5.0 (Windows NT 10.0; rv:110.0) Gecko/20100101 Firefox/110.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 YaBrowser/23.1.5.708 Yowser/2.5 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/113.0",
"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36"
]

33
app_fetcher/src/utils.py Normal file
View File

@@ -0,0 +1,33 @@
def remove_http_s(url):
url = url.replace("https://", "") if url.startswith("https://") else url
url = url.replace("http://", "") if url.startswith("http://") else url
return url
def is_valid_url(url):
# Only https URLs are considered valid
return url.startswith("https://")
def get_searxng_instances():
# SearxNG instances: https://searx.space/
searx_instances = set()
searx_instances.add("https://searx.work/")
searx_instances.add("https://search.ononoki.org/")
searx_instances.add("https://searxng.nicfab.eu/")
searx_instances.add("https://searx.be/")
# searx_instances.add("https://searx.fmac.xyz/")
# searx_instances.add("https://northboot.xyz/") # FIX
# searx_instances.add("https://serx.ml/") # Offline
# searx_instances.add("https://searx.ru/")
# searx_instances.add("https://searx.sp-codes.de/")
# searx_instances.add("https://searxng.nicfab.eu/")
# searx_instances.add("https://s.frlt.one/")
# searx_instances.add("https://search.sapti.me/")
# To list
list_searx_instances = list(searx_instances)
return list_searx_instances

View File

@@ -1,4 +1,5 @@
from django.db import models
from django.contrib.postgres.fields import ArrayField
# Create your models here.
class Urls(models.Model):
@@ -44,3 +45,17 @@ class UrlsSource(models.Model):
managed = False
db_table = 'urls_source'
unique_together = (('id_url', 'id_source'),)
class UrlContent(models.Model):
id_url = models.OneToOneField(Urls, models.DO_NOTHING, db_column='id_url', primary_key=True)
date_published = models.DateTimeField(blank=True, null=True)
title = models.TextField(blank=True, null=True)
description = models.TextField(blank=True, null=True)
content = models.TextField(blank=True, null=True)
tags = ArrayField(models.TextField(blank=True, null=True))
authors = ArrayField(models.TextField(blank=True, null=True))
image_urls = ArrayField(models.TextField(blank=True, null=True))
class Meta:
managed = False
db_table = 'url_content'

View File

@@ -14,7 +14,7 @@
<tbody>
{% for item in page_obj %}
<tr>
<td><a href="https://{{ item.url }}/">{{ item.url }}</a></td>
<td><a href="{{ item.url }}/" target="_blank">{{ item.url }}</a></td>
<td>{{ item.ts_fetch }}</td>
<td>
{% with sources_map|dict_get:item.id as sources %}

View File

@@ -131,7 +131,7 @@
<table class="table table-bordered">
<tr>
<th>URL</th>
<td>{{ url_item.url }}</td>
<td><a href="{{ url_item.url }}" target="_blank">{{ url_item.url }}</a></td>
</tr>
<tr>
<th>Fetch Date</th>
@@ -145,9 +145,32 @@
<th>Status</th>
<td>{{ url_item.status }}</td>
</tr>
<tr>
<th>Title</th>
<td>{{ url_content.title }}</td>
</tr>
<tr>
<th>Description</th>
<td>{{ url_content.description }}</td>
</tr>
<tr>
<th>Content</th>
<td>{{ url_content.content }}</td>
</tr>
<tr>
<th>Tags</th>
<td>{{ url_content.tags }}</td>
</tr>
<tr>
<th>Authors</th>
<td>{{ url_content.authors }}</td>
</tr>
<tr>
<th>Image URLs</th>
<td>{{ url_content.image_urls }}</td>
</tr>
</table>
<!-- Independent form for optional values -->
<form onsubmit="fetchDetailsWithSelection(event, {{ url_item.id }}, '{{ url_item.url }}')">
<label for="options-{{ url_item.id }}">Model:</label>

View File

@@ -7,7 +7,7 @@ import json
import time
import ollama
from .models import Urls, Source, UrlsSource
from .models import Urls, Source, UrlsSource, UrlContent
# Create your views here.
def index(request):
@@ -60,19 +60,27 @@ def news(request):
def url_detail_view(request, id):
url_item = get_object_or_404(Urls, id=id)
url_sources = list(Source.objects.filter(urlssource__id_url=url_item).values_list('source', flat=True))
try:
url_content = UrlContent.objects.get(pk=id)
except UrlContent.DoesNotExist:
url_content = {}
#print(url_content.__dict__)
# TODO: https://github.com/ollama/ollama-python?tab=readme-ov-file#async-client
# LLM models available
client = ollama.Client(host = 'https://ollamamodel.matitos.org')
models = [m.model for m in client.list().models]
models = sorted([m.model for m in client.list().models])
print(models)
context = {
'url_item': url_item,
'sources': url_sources,
'models': models,
"prompt": "Provide in one paragraph the what, why, when, where, who, and how of the content below. Also provide a one paragraph summary of the content:",
'prompt': "Provide in one paragraph the what, why, when, where, who, and how of the content below. Also provide a one paragraph summary of the content:",
#"prompt": "Image you are a journalist, TLDR in a paragraph:",
#"prompt": "Below you will find the whole content of a news article:\n{}\nProvide a concise summary of one paragraph maximum of the content.".format(content)
'url_content': url_content,
}
return render(request, 'url_detail.html', context)