Url content
.gitignore (vendored, new file, 3 lines)
@@ -0,0 +1,3 @@
__pycache__/
*.pyc
**/credentials.py
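src/credentials.py is deliberately untracked; below is a minimal illustrative sketch of what it provides, inferred only from how app_fetcher/app.py and src/db_utils.py read it (field names come from the source, all values are placeholders):

```
# Illustrative template for the ignored credentials module (placeholder values only)
ENVIRONMENT = "development"

# psycopg connection string, same format as used in 1-DB.ipynb
db_connect_info = "host={} port={} user={} password={} dbname={}".format(
    "localhost", "5432", "supermatitos", "supermatitos", "matitos"
)

# Redis connection parameters as read by URL_DB_Writer
redis_connect_info = {
    "redis_host": "localhost",
    "redis_port": 6379,
    "expiry_seconds": 172800,  # 48 hours, the default fallback in db_utils.py
}
```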
1-DB.ipynb (206 changed lines)
@@ -2,7 +2,7 @@
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@@ -11,16 +11,40 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"db_postgres\n",
|
||||
"\u001b[1A\u001b[1B\u001b[0G\u001b[?25l[+] Running 0/0\n",
|
||||
" ⠿ Container db_postgres \u001b[39mStarting\u001b[0m \u001b[34m0.1s \u001b[0m\n",
|
||||
"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l\u001b[34m[+] Running 1/1\u001b[0m\n",
|
||||
" \u001b[32m✔\u001b[0m Container db_postgres \u001b[32mStarted\u001b[0m \u001b[34m0.2s \u001b[0m\n",
|
||||
"\u001b[?25h"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"!docker rm -f db_postgres; docker compose -f docker/docker-compose.yml up -d ; sleep 10"
|
||||
]
|
||||
},
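The sleep 10 above simply gives the db_postgres container time to come up; a hedged alternative is to poll until Postgres accepts connections, using the same connection settings as later in this notebook:

```
# Sketch: poll Postgres until it accepts connections instead of a fixed sleep.
import time
import psycopg

connection_info = "host=localhost port=5432 user=supermatitos password=supermatitos dbname=matitos"

for attempt in range(30):
    try:
        with psycopg.connect(connection_info, connect_timeout=2):
            print("Postgres is ready (attempt {})".format(attempt + 1))
            break
    except psycopg.OperationalError:
        time.sleep(1)
else:
    raise RuntimeError("Postgres did not become ready in time")
```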
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"INSERT_TABLES = False\n",
|
||||
"INSERT_SAMPLE_DATA = False\n",
|
||||
"INSERT_TABLES = True\n",
|
||||
"INSERT_SAMPLE_DATA = True\n",
|
||||
"\n",
|
||||
"import psycopg\n",
|
||||
"connection_info = \"host={} port={} user={} password={} dbname={}\".format(\"localhost\", \"5432\", \"supermatitos\", \"supermatitos\", \"matitos\")\n",
|
||||
"\n",
|
||||
"from datetime import datetime, timezone\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"if INSERT_TABLES:\n",
|
||||
" # Connect to an existing database\n",
|
||||
@@ -87,14 +111,14 @@
|
||||
" \n",
|
||||
" \n",
|
||||
" CREATE TABLE URL_CONTENT (\n",
|
||||
" id_url INTEGER REFERENCES URLS(id),\n",
|
||||
" date_published TIMESTAMPTZ NOT NULL DEFAULT NOW(),\n",
|
||||
" id_url INTEGER PRIMARY KEY REFERENCES URLS(id),\n",
|
||||
" date_published TIMESTAMPTZ DEFAULT NOW(),\n",
|
||||
" title TEXT,\n",
|
||||
" description TEXT,\n",
|
||||
" content TEXT,\n",
|
||||
" tags TEXT[],\n",
|
||||
" authors TEXT[],\n",
|
||||
" image_urls TEXT[],\n",
|
||||
" image_urls TEXT[]\n",
|
||||
" );\n",
|
||||
" CREATE INDEX idx_tags ON URL_CONTENT USING GIN(tags);\n",
|
||||
" CREATE INDEX idx_authors ON URL_CONTENT USING GIN(authors);\n",
|
||||
@@ -119,7 +143,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@@ -137,7 +161,6 @@
|
||||
" cur.execute(\"INSERT INTO URLS (url, status) values ('https://www.dw.com/en/trauma-how-child-abuse-victims-deal-with-parenthood/a-71833895', 'valid')\")\n",
|
||||
" cur.execute(\"INSERT INTO URLS (url, status) values ('https://nypost.com/2025/03/06/us-news/colorado-day-care-worker-hit-with-51-charges-of-child-abuse-harassment-for-slapping-toddler/', 'valid')\")\n",
|
||||
" cur.execute(\"INSERT INTO URLS (url, status) values ('https://www.fox35orlando.com/news/tavares-police-florida-boys-10-9-abused-sheer-brutality', 'valid')\")\n",
|
||||
" # Invalid\n",
|
||||
" cur.execute(\"INSERT INTO URLS (url, status) values ('https://www.google.com', 'invalid')\")\n",
|
||||
"\n",
|
||||
" cur.execute(\"INSERT INTO SOURCE (source) values ('news.google.com')\")\n",
|
||||
@@ -162,14 +185,157 @@
|
||||
" \n",
|
||||
" # Long URLs \n",
|
||||
" cur.execute(\"INSERT INTO URLS (url, status) values ('www.super_url.org/superextrakmsdimsdf/349mvlsdfsdfwr/akivsdmimnsdifmisdf_23dj9sdgj9sdgj8sdf8ds8f.html', 'invalid')\".format(j))\n",
|
||||
" cur.execute(\"INSERT INTO URLS (url, status) values ('www.super_url.org/superextrakmsdimsdf/349mvlsdfsdfwr/akivsdmimnsdifmisdf.html', 'invalid')\".format(j))"
|
||||
" cur.execute(\"INSERT INTO URLS (url, status) values ('www.super_url.org/superextrakmsdimsdf/349mvlsdfsdfwr/akivsdmimnsdifmisdf.html', 'invalid')\".format(j))\n",
|
||||
"\n",
|
||||
" # URL Content\n",
|
||||
" content = \"Bla Bla Bla!!!\"*25\n",
|
||||
" cur.execute(\"INSERT INTO URL_CONTENT (id_url, date_published, title, description, content, tags, authors, image_urls) values (%s, %s, 'Mommy blogger turned child abuser', %s, 'Hello there!', %s, %s, %s)\", (1, datetime.now(tz=timezone.utc), content, [\"child abuse\", \"social media\"], [\"Audrey Conklin\"], [\"https://a57.foxnews.com/static.foxnews.com/foxnews.com/content/uploads/2023/08/1440/810/image-58.jpg?ve=1&tl=1\"]))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\t urls\n",
|
||||
"[(1,\n",
|
||||
" 'https://www.foxnews.com/us/husband-ruby-franke-utah-mommy-blogger-convicted-child-abuse-regrets-wifes-fall-fame',\n",
|
||||
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" 'valid'),\n",
|
||||
" (2,\n",
|
||||
" 'https://www.bbc.com/news/articles/ckg843y8y7no',\n",
|
||||
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" 'valid'),\n",
|
||||
" (3,\n",
|
||||
" 'https://www.wilx.com/2025/03/05/lenawee-county-man-arrested-possessing-child-abuse-material/',\n",
|
||||
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" 'valid'),\n",
|
||||
" (4,\n",
|
||||
" 'https://www.dw.com/en/trauma-how-child-abuse-victims-deal-with-parenthood/a-71833895',\n",
|
||||
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" 'valid'),\n",
|
||||
" (5,\n",
|
||||
" 'https://nypost.com/2025/03/06/us-news/colorado-day-care-worker-hit-with-51-charges-of-child-abuse-harassment-for-slapping-toddler/',\n",
|
||||
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" 'valid'),\n",
|
||||
" (6,\n",
|
||||
" 'https://www.fox35orlando.com/news/tavares-police-florida-boys-10-9-abused-sheer-brutality',\n",
|
||||
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" 'valid'),\n",
|
||||
" (7,\n",
|
||||
" 'https://www.google.com',\n",
|
||||
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" 'invalid'),\n",
|
||||
" (8,\n",
|
||||
" 'www.super_0.org',\n",
|
||||
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" 'invalid'),\n",
|
||||
" (9,\n",
|
||||
" 'www.super_1.org',\n",
|
||||
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" 'invalid'),\n",
|
||||
" (10,\n",
|
||||
" 'www.super_2.org',\n",
|
||||
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" 'invalid'),\n",
|
||||
" (11,\n",
|
||||
" 'www.super_3.org',\n",
|
||||
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" 'invalid'),\n",
|
||||
" (12,\n",
|
||||
" 'www.super_4.org',\n",
|
||||
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" 'invalid'),\n",
|
||||
" (13,\n",
|
||||
" 'www.super_5.org',\n",
|
||||
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" 'invalid'),\n",
|
||||
" (14,\n",
|
||||
" 'www.super_6.org',\n",
|
||||
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" 'invalid'),\n",
|
||||
" (15,\n",
|
||||
" 'www.super_7.org',\n",
|
||||
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" 'invalid'),\n",
|
||||
" (16,\n",
|
||||
" 'www.super_8.org',\n",
|
||||
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" 'invalid'),\n",
|
||||
" (17,\n",
|
||||
" 'www.super_9.org',\n",
|
||||
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" 'invalid'),\n",
|
||||
" (18,\n",
|
||||
" 'www.super_10.org',\n",
|
||||
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" 'invalid'),\n",
|
||||
" (19,\n",
|
||||
" 'www.super_11.org',\n",
|
||||
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" 'invalid'),\n",
|
||||
" (20,\n",
|
||||
" 'www.super_12.org',\n",
|
||||
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" 'invalid'),\n",
|
||||
" (21,\n",
|
||||
" 'www.super_13.org',\n",
|
||||
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" 'invalid'),\n",
|
||||
" (22,\n",
|
||||
" 'www.super_14.org',\n",
|
||||
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" 'invalid'),\n",
|
||||
" (23,\n",
|
||||
" 'www.super_url.org/superextrakmsdimsdf/349mvlsdfsdfwr/akivsdmimnsdifmisdf_23dj9sdgj9sdgj8sdf8ds8f.html',\n",
|
||||
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" 'invalid'),\n",
|
||||
" (24,\n",
|
||||
" 'www.super_url.org/superextrakmsdimsdf/349mvlsdfsdfwr/akivsdmimnsdifmisdf.html',\n",
|
||||
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" 'invalid')]\n",
|
||||
"\t urls_duplicate\n",
|
||||
"[]\n",
|
||||
"\t feed\n",
|
||||
"[(1,\n",
|
||||
" 'https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC')]\n",
|
||||
"\t website_of_interest\n",
|
||||
"[(1, 'www.unicef.org')]\n",
|
||||
"\t search\n",
|
||||
"[(1, 'child abuse')]\n",
|
||||
"\t urls_source\n",
|
||||
"[(1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (1, 2), (2, 2), (3, 2)]\n",
|
||||
"\t source\n",
|
||||
"[(1, 'news.google.com'), (2, 'qwant.com')]\n",
|
||||
"\t website_to_filter\n",
|
||||
"[(1, 'yewtu.be'),\n",
|
||||
" (2, 'twitter.com'),\n",
|
||||
" (3, 'libreddit.de'),\n",
|
||||
" (4, 'youtube.com'),\n",
|
||||
" (5, 'tiktok.com'),\n",
|
||||
" (6, 'radio.foxnews.com')]\n",
|
||||
"\t status_pattern_matching\n",
|
||||
"[('.*missingkids.org/poster/.*', 50, 'valid')]\n",
|
||||
"\t url_content\n",
|
||||
"[(1,\n",
|
||||
" datetime.datetime(2025, 3, 6, 23, 4, 37, 654130, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" 'Mommy blogger turned child abuser',\n",
|
||||
" 'Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla '\n",
|
||||
" 'Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla '\n",
|
||||
" 'Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla '\n",
|
||||
" 'Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla '\n",
|
||||
" 'Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!',\n",
|
||||
" 'Hello there!',\n",
|
||||
" ['child abuse', 'social media'],\n",
|
||||
" ['Audrey Conklin'],\n",
|
||||
" ['https://a57.foxnews.com/static.foxnews.com/foxnews.com/content/uploads/2023/08/1440/810/image-58.jpg?ve=1&tl=1'])]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from pprint import pprint\n",
|
||||
"\n",
|
||||
@@ -188,8 +354,22 @@
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "matitos",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"name": "python"
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.12.9"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
||||
@@ -7,9 +7,12 @@ pip install ipykernel django requests ollama psycopg[binary] # openai

# Development

-* web_app
+* app_web
```

# 1) Change models.py
python manage.py inspectdb

# 2)
python manage.py makemigrations
# 3)
@@ -23,7 +26,7 @@ python manage.py migrate --fake-initial
python manage.py createsuperuser
```

-* Image generation
+* app_img_gen
```
docker build -t image_generation .
docker run --rm -it -p 12343:80 image_generation

app_fetcher/Dockerfile (new file, 16 lines)
@@ -0,0 +1,16 @@
FROM continuumio/miniconda3:23.10.0-1

# App repository
COPY . /opt/app/

RUN conda install -c conda-forge curl
RUN pip install --no-cache-dir --upgrade "psycopg[binary]" git+https://github.com/ranahaani/GNews.git GoogleNews duckduckgo_search newspaper3k numpy beautifulsoup4 requests feedparser pytz redis fastapi uvicorn fastapi-utils lxml[html_clean]
RUN pip freeze
# GoogleNews-1.6.10 Pillow-10.1.0 PyYAML-6.0.1 aiofiles-23.2.1 anyio-3.7.1 beautifulsoup4-4.9.3 bs4-0.0.1 click-8.1.7 cssselect-1.2.0 dateparser-1.2.0 dnspython-1.16.0 duckduckgo_search-3.9.8 fastapi-0.104.1 fastapi-utils-0.2.1 feedfinder2-0.0.4 feedparser-6.0.10 filelock-3.13.1 gnews-0.3.6 greenlet-3.0.1 h11-0.14.0 h2-4.1.0 hpack-4.0.0 httpcore-1.0.2 httpx-0.25.2 hyperframe-6.0.1 jieba3k-0.35.1 joblib-1.3.2 lxml-4.9.3 newspaper3k-0.2.8 nltk-3.8.1 numpy-1.26.2 psycopg-3.1.13 psycopg-binary-3.1.13 pydantic-1.10.13 pymongo-3.12.3 python-dateutil-2.8.2 python-dotenv-0.19.2 pytz-2023.3.post1 redis-5.0.1 regex-2023.10.3 requests-2.26.0 requests-file-1.5.1 sgmllib3k-1.0.0 six-1.16.0 sniffio-1.3.0 socksio-1.0.0 soupsieve-2.5 sqlalchemy-1.4.50 starlette-0.27.0 tinysegmenter-0.3 tldextract-5.1.1 typing-extensions-4.8.0 tzlocal-5.2 uvicorn-0.24.0.post1

WORKDIR /opt/app

CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "80"]

# docker build -t fetch_app .
# docker run --rm --name container_fetch_app fetch_app
app_fetcher/README.md (new file, 12 lines)
@@ -0,0 +1,12 @@
# Fetcher

* Fetcher app
    - Contains several endpoints, each performing a specific type of fetch task
    - For more details, see the /{fetch_type} route in [app.py](app.py)

* Build and run
    - Important: to be deployed with the other micro-services via [docker-compose.yml](../docker-compose.yml)
```
docker build -t fetch_app .
docker run --rm --name container_fetch_app fetch_app
```
app_fetcher/app.py (new file, 91 lines)
@@ -0,0 +1,91 @@
|
||||
import src.credentials as cred
|
||||
import logging
|
||||
from logging.handlers import RotatingFileHandler
|
||||
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
|
||||
logger = logging.getLogger("news_fetcher")
|
||||
logger.setLevel(logging.INFO)
|
||||
|
||||
import os
|
||||
os.makedirs("logs", exist_ok=True)
|
||||
|
||||
# To file log
|
||||
fh = RotatingFileHandler(filename="logs/log_app_fetcher.log", mode="a", maxBytes=10000000, backupCount=4)
|
||||
fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
|
||||
logger.addHandler(fh)
|
||||
|
||||
# To file log: WARNING / ERROR
|
||||
fh_ = RotatingFileHandler(filename="logs/log_app_fetcher_error.log", mode="a", maxBytes=10000000, backupCount=1)
|
||||
fh_.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
|
||||
fh_.setLevel(logging.WARNING)
|
||||
logger.addHandler(fh_)
|
||||
|
||||
logger.info("Environment: {}".format(cred.ENVIRONMENT))
|
||||
|
||||
##################################################################################################
|
||||
from src.news_feed import NewsFeed
|
||||
from src.news_parsing import NewsSiteParsing
|
||||
from src.news_search import NewsSearch
|
||||
from src.news_missing_kids import NewsMissingKids
|
||||
from src.missing_kids_status import MissingKidsStatus
|
||||
from src.url_status import UpdateErrorURLs
|
||||
from src.fetcher_status import FetcherStatus
|
||||
|
||||
from fastapi import FastAPI, BackgroundTasks
|
||||
# import requests
|
||||
# from fastapi_utils.tasks import repeat_every
|
||||
# import time
|
||||
# time.sleep(10)
|
||||
# import gc
|
||||
|
||||
app = FastAPI()
|
||||
|
||||
@app.get("/")
|
||||
def hello_world():
|
||||
return {"message": "OK"}
|
||||
|
||||
@app.get("/{fetch_type}")
|
||||
async def fetch(background_tasks: BackgroundTasks, fetch_type: str):
|
||||
# Concurrent job running
|
||||
logger.info("Triggered fetch: {}".format(fetch_type))
|
||||
|
||||
if (fetch_type == "feeds"):
|
||||
task_run = NewsFeed(cred.db_connect_info, cred.redis_connect_info).run
|
||||
elif (fetch_type == "parser"):
|
||||
task_run = NewsSiteParsing(cred.db_connect_info, cred.redis_connect_info).run
|
||||
elif (fetch_type == "fetch_missing_kids_reduced"):
|
||||
task_run = NewsMissingKids(cred.db_connect_info, cred.redis_connect_info, num_pages=4).run
|
||||
elif (fetch_type == "fetch_missing_kids_full"):
|
||||
task_run = NewsMissingKids(cred.db_connect_info, cred.redis_connect_info, num_pages=100000).run
|
||||
elif (fetch_type == "search") or (fetch_type == "search_full"):
|
||||
task_run = NewsSearch(cred.db_connect_info, cred.redis_connect_info, full=True).run
|
||||
elif (fetch_type == "search_reduced"):
|
||||
task_run = NewsSearch(cred.db_connect_info, cred.redis_connect_info, full=False).run
|
||||
elif (fetch_type == "update_missing_kids_status_reduced"):
|
||||
task_run = MissingKidsStatus(cred.db_connect_info, cred.redis_connect_info, num_urls=50).update_missing_kids_status
|
||||
elif (fetch_type == "update_missing_kids_status_full"):
|
||||
task_run = MissingKidsStatus(cred.db_connect_info, cred.redis_connect_info, num_urls=None).update_missing_kids_status
|
||||
elif (fetch_type == "update_error_urls"):
|
||||
task_run = UpdateErrorURLs(cred.db_connect_info, cred.redis_connect_info, num_urls=100).update_error_urls_status
|
||||
elif (fetch_type == "fetch_warning_check"):
|
||||
task_run = FetcherStatus(cred.db_connect_info, cred.redis_connect_info, last_minutes_check=180).check_warning
|
||||
else:
|
||||
return {"message": "ERROR. Unknown fetcher type!"}
|
||||
|
||||
# Run task
|
||||
background_tasks.add_task(task_run)
|
||||
# Return message
|
||||
return {"message": "Started fetching {}: Ok".format(fetch_type)}
|
||||
|
||||
##################################################################################################
|
||||
|
||||
###########################
|
||||
'''
|
||||
@app.on_event("startup")
|
||||
def verify_db() -> None:
|
||||
logger.info("Testing DB connection")
|
||||
import psycopg
|
||||
with psycopg.connect(cred.db_connect_info) as conn:
|
||||
url_test_msg = "Num URLs: {}".format(conn.execute("SELECT COUNT(*) FROM URLS;").fetchall())
|
||||
logger.info(url_test_msg)
|
||||
'''
|
||||
###########################
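# Illustrative usage sketch (not part of the app): once the container's port 80 is
# mapped locally (the exact mapping lives in docker-compose.yml and is assumed here),
# each background job can be triggered with a plain GET on its fetch_type:
#
#   import requests
#   for fetch_type in ["feeds", "parser", "search_reduced", "update_error_urls"]:
#       r = requests.get("http://localhost:8080/{}".format(fetch_type))
#       print(fetch_type, r.json())  # e.g. {"message": "Started fetching feeds: Ok"}
###########################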
|
||||
app_fetcher/src/db_utils.py (new file, 456 lines)
@@ -0,0 +1,456 @@
|
||||
import psycopg
|
||||
import redis
|
||||
import traceback
|
||||
import random
|
||||
import requests
|
||||
import json
|
||||
import os
|
||||
from .url_utils import process_article
|
||||
import logging
|
||||
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
|
||||
logger = logging.getLogger("news_fetcher")
|
||||
|
||||
# TODO: URL_DB_HANDLER, _get_search_list, _get_url_host, _get_url_host_list, ...
|
||||
# The rest, elsewhere
|
||||
|
||||
class URL_DB_Writer():
|
||||
def __init__(self, db_connect_info, redis_connect_info):
|
||||
logger.debug("Initializing URL DB writer")
|
||||
self.db_connect_info = db_connect_info
|
||||
self.redis_instance = redis.Redis(host=redis_connect_info.get("redis_host"), port=redis_connect_info.get("redis_port"))
|
||||
self.redis_expiry_seconds = redis_connect_info.get("expiry_seconds", 172800) # Default: 48 hours
|
||||
|
||||
try:
|
||||
self.redis_instance.ping()
|
||||
logger.debug("Succesfully pinged Redis")
|
||||
except Exception as e:
|
||||
logger.warning("Error trying to ping Redis: {}".format(str(e)))
|
||||
|
||||
def get_urls_count(self, last_minutes_check):
|
||||
#####################
|
||||
### Get number of URLs within last X minutes
|
||||
#####################
|
||||
try:
|
||||
# Update
|
||||
with psycopg.connect(self.db_connect_info) as conn:
|
||||
# Open cursor
|
||||
cursor = conn.cursor()
|
||||
num_urls = cursor.execute("SELECT COUNT(*) FROM URLS WHERE ts_fetch >= current_timestamp - interval '{} minutes';".format(last_minutes_check)).fetchone()[0]
|
||||
except Exception as e:
|
||||
logger.warning("Error updating URLs status: {}".format(str(e)))
|
||||
num_urls = None
|
||||
return num_urls
|
||||
|
||||
def _format(self, values):
|
||||
# Replace single quote ' with ''. Based on https://stackoverflow.com/a/12320729
|
||||
# String -> 'string', Int -> '1' (string-based), None -> NULL (no quotes for pgSQL to interpret Null value)
|
||||
if (type(values) == list) or (type(values) == tuple):
|
||||
insert_args = "(" + ", ".join([ "NULL" if v is None else "'" + str(v).replace("'", "''") + "'" for v in values]) + ")"
|
||||
elif (type(values) == str):
|
||||
insert_args = "({})".format( "NULL" if values is None else "'" + values.replace("'", "''") + "'" )
|
||||
else:
|
||||
logger.warning("Error formatting input values: {}".format(values))
|
||||
assert False
|
||||
return insert_args
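# Illustrative behaviour of _format (examples added for clarity, not in the original):
#   _format(("https://example.com", "valid"))  ->  "('https://example.com', 'valid')"
#   _format(("O'Brien", None))                 ->  "('O''Brien', NULL)"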
|
||||
|
||||
def _get_cached_canonical_url(self, url):
|
||||
### Redis: URL processed recently? -> Avoid increasing SERIAL counter & efficiency of DB
|
||||
try:
|
||||
filter_url = self.redis_instance.get(url)
|
||||
if (filter_url is not None):
|
||||
filter_url = filter_url.decode("utf-8")
|
||||
except Exception as e:
|
||||
logger.warning("Exception querying Redis: {}".format(str(e)))
|
||||
filter_url = None
|
||||
return filter_url
|
||||
|
||||
def _update_urls_status(self, dict_status_ids):
|
||||
#####################
|
||||
### Update status to array of URL IDs
|
||||
#####################
|
||||
try:
|
||||
# Update
|
||||
with psycopg.connect(self.db_connect_info) as conn:
|
||||
# Open cursor
|
||||
cursor = conn.cursor()
|
||||
# Autocommit at end of transaction (Atomic insert of URLs and sources)
|
||||
with conn.transaction() as tx:
|
||||
for key_status, value_ids in dict_status_ids.items():
|
||||
cursor.execute("UPDATE URLS SET status='{}' WHERE id IN ({});".format(key_status, ",".join([str(v) for v in value_ids])))
|
||||
except Exception as e:
|
||||
logger.warning("Error updating URLs status: {}".format(str(e)))
|
||||
|
||||
def _get_missing_kids_urls(self, num_urls=None):
|
||||
#####################
|
||||
### Get list of Missing Kids URLs
|
||||
#####################
|
||||
try:
|
||||
missing_kids_ids_and_urls = []
|
||||
if (num_urls is None):
|
||||
limit = 500
|
||||
else:
|
||||
limit = num_urls
|
||||
offset = 0
|
||||
with psycopg.connect(self.db_connect_info) as conn:
|
||||
# Open cursor
|
||||
cursor = conn.cursor()
|
||||
while True:
|
||||
# Query
|
||||
missing_kids_ids_and_urls_query = cursor.execute("SELECT id, url, status FROM URLS WHERE url LIKE '%missingkids.org/poster%' ORDER BY ts_fetch DESC LIMIT {} OFFSET {};".format(limit, offset)).fetchall()
|
||||
# Finished?
|
||||
if (len(missing_kids_ids_and_urls_query) == 0):
|
||||
break
|
||||
# Extend
|
||||
missing_kids_ids_and_urls = missing_kids_ids_and_urls + missing_kids_ids_and_urls_query
|
||||
# Offset
|
||||
offset += len(missing_kids_ids_and_urls_query)
|
||||
# Stop?
|
||||
if (num_urls is not None) and (len(missing_kids_ids_and_urls) >= num_urls):
|
||||
break
|
||||
|
||||
except Exception as e:
|
||||
logger.warning("Error getting Missing Kids URLs: {}".format(str(e)))
|
||||
missing_kids_ids_and_urls = []
|
||||
return missing_kids_ids_and_urls
|
||||
|
||||
def _get_error_urls(self, num_urls=None):
|
||||
#####################
|
||||
### Get list of error URLs
|
||||
#####################
|
||||
try:
|
||||
error_urls = []
|
||||
if (num_urls is None):
|
||||
limit = 500
|
||||
else:
|
||||
limit = num_urls
|
||||
offset = 0
|
||||
with psycopg.connect(self.db_connect_info) as conn:
|
||||
# Open cursor
|
||||
cursor = conn.cursor()
|
||||
while True:
|
||||
# Query
|
||||
error_urls_query = cursor.execute("SELECT id, url FROM URLS WHERE status='error' ORDER BY ts_fetch DESC LIMIT {} OFFSET {};".format(limit, offset)).fetchall()
|
||||
# Finished?
|
||||
if (len(error_urls_query) == 0):
|
||||
break
|
||||
# Extend
|
||||
error_urls = error_urls + error_urls_query
|
||||
# Offset
|
||||
offset += len(error_urls_query)
|
||||
# Stop?
|
||||
if (num_urls is not None) and (len(error_urls) >= num_urls):
|
||||
break
|
||||
|
||||
except Exception as e:
|
||||
logger.warning("Error getting Error URLs: {}".format(str(e)))
|
||||
error_urls = []
|
||||
return error_urls
|
||||
|
||||
def _decode_urls(self, urls_fetched, list_domains_to_filter, list_pattern_status_tuple): # TODO: language for urls_fetched...
|
||||
"""
|
||||
# TODO: REFACTOR
|
||||
For each input url
|
||||
|
||||
Already processed?
|
||||
-> Update on Redis expire time
|
||||
-> Associate to source
|
||||
Not processed? Get main URL:
|
||||
-> URL Canonical valid?
|
||||
-> Rely on this as main URL
|
||||
-> URL Canonical not valid?
|
||||
-> Use input url, unless it's a news.google.com link
|
||||
-> If news.google.com link, filter out. REDIS?
|
||||
Main URL processing:
|
||||
-> Update in REDIS, association url -> url_canonical
|
||||
-> url != url_canonical: Add in duplicate table
|
||||
If both != news.google.com
|
||||
"""
|
||||
|
||||
# URLs to insert, URLs duplicated association, URL to Canonical form
|
||||
list_insert_url_tuple_args, list_tuple_canonical_duplicate_urls, dict_full_urls_to_canonical = [], [], {}
|
||||
|
||||
# URL VS CANONICAL:
|
||||
# News URL returned: https://news.google.com/articles/CBMifmh0dHBzOi8vd3d3LmJyZWl0YmFydC5jb20vMm5kLWFtZW5kbWVudC8yMDIzLzA0LzAzL2dvdi1kZXNhbnRpcy1zaWducy1iaWxsLW1ha2luZy1mbG9yaWRhLXRoZS0yNnRoLWNvbnN0aXR1dGlvbmFsLWNhcnJ5LXN0YXRlL9IBAA?hl=en-US&gl=US&ceid=US%3Aen
|
||||
# Corresponds to canonical URL: https://www.breitbart.com/2nd-amendment/2023/04/03/gov-desantis-signs-bill-making-florida-the-26th-constitutional-carry-state/
|
||||
|
||||
for url in urls_fetched:
|
||||
# Domain to filter? Input url
|
||||
filter_due_to_domain = False
|
||||
for domain_to_filter in list_domains_to_filter:
|
||||
if (domain_to_filter in url):
|
||||
logger.debug("Domain filter applied based on {} for input URL: {}".format(domain_to_filter, url))
|
||||
filter_due_to_domain = True
|
||||
if (filter_due_to_domain):
|
||||
continue
|
||||
|
||||
# URL processed recently? -> Filter and avoid increasing SERIAL counter & efficiency of DB
|
||||
cached_canonical_url = self._get_cached_canonical_url(url)
|
||||
if (cached_canonical_url is not None):
|
||||
# Even if url processed, need to add url_canonical to list_filtered_urls, so as to associate search source to canonical URL (canonical is the main URL entry)
|
||||
dict_full_urls_to_canonical[url] = cached_canonical_url # X -> Y
|
||||
# If url has been processed, so was its canonical form
|
||||
logger.debug("Filtering out already inserted (processed) URL and its canonical form: {} {}".format(url, cached_canonical_url))
|
||||
continue
|
||||
|
||||
# Process TODO: Add language...
|
||||
url_canonical, article_elements, article_status = process_article(url, list_pattern_status_tuple)
|
||||
# TODO: Store article_elements information to insert into OS after inserted into DB (and therefore having associated url_id)
|
||||
|
||||
# Could not retrieve redirection for news.google.com based URL? Continue (avoid inserting in DB)
|
||||
if (url_canonical is None) and ("news.google.com" in url):
|
||||
logger.debug("Filtering empty canonical link for base URL based on news.google.com: {}".format(url))
|
||||
continue
|
||||
# Canonical URL still news.google.com? Continue (avoid inserting in DB)
|
||||
if (url_canonical is not None) and ("news.google.com" in url_canonical):
|
||||
logger.debug("Filtering canonical news.google.com based URL: {}".format(url_canonical))
|
||||
continue
|
||||
|
||||
# Domain to filter? Input canonical_url
|
||||
filter_due_to_domain = False
|
||||
for domain_to_filter in list_domains_to_filter:
|
||||
if (url_canonical is not None) and (domain_to_filter in url_canonical):
|
||||
filter_due_to_domain = True
|
||||
if (filter_due_to_domain):
|
||||
logger.info("Filtering due to domain input URL, Canonical_URL: {} {}".format(url, url_canonical))
|
||||
continue
|
||||
|
||||
if (url_canonical is None) or (article_status == "error"):
|
||||
logger.debug("Processing failed for URL: {}".format(url))
|
||||
# Still insert URL with "error"? -> If processed later, might have inconsistent sources (url vs url_canonical). Only store if not news.google.com based
|
||||
if ("news.google.com" in url) or ("consent.google.com" in url):
|
||||
logging.debug("Not able to process Google News link, skipping: {}".format(url))
|
||||
else:
|
||||
dict_full_urls_to_canonical[url] = url # X -> X
|
||||
list_insert_url_tuple_args.append( (url, article_status) )
|
||||
continue
|
||||
|
||||
# URL was not processed (not sure canonical yet). Generate URL_CANONICAL <-> URL_ORIGINAL association if they're different
|
||||
if (url_canonical != url):
|
||||
list_tuple_canonical_duplicate_urls.append( (url_canonical, url) )
|
||||
# Dict: url -> canonical (update association)
|
||||
dict_full_urls_to_canonical[url] = url_canonical # X -> Y or X
|
||||
|
||||
# Canonical URL processed recently? -> Filter and avoid increasing SERIAL counter & efficiency of DB
|
||||
if (self._get_cached_canonical_url(url_canonical) is not None):
|
||||
# Canonical URL was already processed
|
||||
logger.debug("Filtering out already inserted (processed) URL canonical: {}".format(url_canonical))
|
||||
else:
|
||||
# Insert url_canonical to DB formatted
|
||||
list_insert_url_tuple_args.append( (url_canonical, article_status) )
|
||||
# Canonical URL different? Process
|
||||
if (url_canonical != url):
|
||||
if ("news.google.com" in url) or ("consent.google.com" in url):
|
||||
logging.debug("Not adding google.news.com based link, skipping: {}".format(url))
|
||||
else:
|
||||
# Fetched url -> duplicate (using canonical as main link)
|
||||
article_status = "duplicate"
|
||||
# Insert url (non-canonical) to DB formatted
|
||||
list_insert_url_tuple_args.append( (url, article_status) )
|
||||
|
||||
return list_insert_url_tuple_args, list_tuple_canonical_duplicate_urls, dict_full_urls_to_canonical
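# Illustrative example (hypothetical URLs): for urls_fetched = ["https://news.google.com/articles/XYZ"]
# whose canonical article resolves to "https://site.example/story" with status "valid", the return is:
#   list_insert_url_tuple_args           -> [("https://site.example/story", "valid")]
#   list_tuple_canonical_duplicate_urls  -> [("https://site.example/story", "https://news.google.com/articles/XYZ")]
#   dict_full_urls_to_canonical          -> {"https://news.google.com/articles/XYZ": "https://site.example/story"}
# (the news.google.com link itself is not inserted, per the filtering above)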
|
||||
|
||||
def _insert_urls(self, cursor, list_insert_url_tuple_args):
|
||||
#####################
|
||||
### Insert URLs with status
|
||||
#####################
|
||||
if (len(list_insert_url_tuple_args) > 0):
|
||||
insert_args = ', '.join( [ self._format(t) for t in list_insert_url_tuple_args] )
|
||||
# Insert. (url_1, status_1), (url_2, status_2), ...
|
||||
sql_code = "INSERT INTO URLS {} VALUES {} ON CONFLICT (url) DO NOTHING;".format("(url, status)", insert_args)
|
||||
# logger.debug("SQL CODE: {}".format(sql_code))
|
||||
c = cursor.execute(sql_code)
|
||||
# NOTE: Not using "RETURNING id" since previously inserted URLs are not returned (ON CONFLICT)
|
||||
# https://stackoverflow.com/questions/35949877/how-to-include-excluded-rows-in-returning-from-insert-on-conflict/35953488#35953488
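# Illustrative generated statement (hypothetical values):
#   INSERT INTO URLS (url, status) VALUES ('https://site.example/a', 'valid'), ('https://site.example/b', 'error') ON CONFLICT (url) DO NOTHING;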
|
||||
|
||||
def _insert_urls_duplicated(self, cursor, list_tuple_canonical_duplicate_urls):
|
||||
#####################
|
||||
### Insert duplicated URLs
|
||||
#####################
|
||||
if (len(list_tuple_canonical_duplicate_urls) > 0):
|
||||
# Flatten, format, set to remove duplicates
|
||||
args_duplicated_urls_set = "(" + ', '.join( set( [ "'" + str(y).replace("'", "''") + "'" for x in list_tuple_canonical_duplicate_urls for y in x] ) ) + ")"
|
||||
|
||||
# Dict: url -> id
|
||||
dict_url_to_id = {}
|
||||
# Get url -> id association to populate duplicated URLs
|
||||
for (id_, url_) in cursor.execute("SELECT id, url FROM URLS WHERE url IN {};".format(args_duplicated_urls_set)).fetchall():
|
||||
dict_url_to_id[url_] = id_
|
||||
|
||||
# Convert tuples (url_canonical, url) -> (id_url_canonical, id_url) to insert in DB
|
||||
# ORIGINAL CODE. Issue, might not have found association to all urls
|
||||
### list_tuple_canonical_duplicate_urls_ids = [ (dict_url_to_id[t[0]], dict_url_to_id[t[1]]) for t in list_tuple_canonical_duplicate_urls]
|
||||
|
||||
list_tuple_canonical_duplicate_urls_ids = []
|
||||
for (url_1, url_2) in list_tuple_canonical_duplicate_urls:
|
||||
id_url_1, id_url_2 = dict_url_to_id.get(url_1), dict_url_to_id.get(url_2)
|
||||
if (id_url_1 is None) or (id_url_2 is None):
|
||||
logger.debug("Skipping duplicate association due to no url -> id_url mapping available for tuple: {} {}".format(url_1, url_2))
|
||||
else:
|
||||
list_tuple_canonical_duplicate_urls_ids.append( (id_url_1, id_url_2) )
|
||||
|
||||
if (len(list_tuple_canonical_duplicate_urls_ids) > 0):
|
||||
insert_args = ', '.join( [ self._format(t) for t in list_tuple_canonical_duplicate_urls_ids] )
|
||||
# Insert. (id_url_canonical_1, id_url_1), ...
|
||||
sql_code = "INSERT INTO URLS_DUPLICATE {} VALUES {} ON CONFLICT DO NOTHING;".format("(id_url_canonical, id_url_duplicated)", insert_args)
|
||||
# logger.debug("SQL CODE: {}".format(sql_code))
|
||||
c = cursor.execute(sql_code)
|
||||
|
||||
def _get_pattern_status_list(self):
|
||||
#####################
|
||||
### Get list of (pattern, priority, status) tuples
|
||||
#####################
|
||||
# TODO: Cache on redis and query once every N hours? ...
|
||||
try:
|
||||
with psycopg.connect(self.db_connect_info) as conn:
|
||||
# Open cursor
|
||||
cursor = conn.cursor()
|
||||
# TODO: Cache on Redis
|
||||
list_pattern_status = cursor.execute("SELECT pattern, priority, status FROM STATUS_PATTERN_MATCHING;").fetchall()
|
||||
except Exception as e:
|
||||
logger.warning("Error getting pattern status list: {}".format(str(e)))
|
||||
list_pattern_status = []
|
||||
return list_pattern_status
|
||||
|
||||
def _get_domains_to_filter(self):
|
||||
#####################
|
||||
### Get list of domains to filter
|
||||
#####################
|
||||
# TODO: Cache on redis and query once every N hours? ...
|
||||
try:
|
||||
with psycopg.connect(self.db_connect_info) as conn:
|
||||
# Open cursor
|
||||
cursor = conn.cursor()
|
||||
# TODO: Cache on Redis
|
||||
sites_to_filter = [e[0] for e in cursor.execute("SELECT url_host FROM WEBSITE_TO_FILTER;").fetchall() ]
|
||||
except Exception as e:
|
||||
logger.warning("Error getting domains to filter: {}".format(str(e)))
|
||||
sites_to_filter = []
|
||||
return sites_to_filter
|
||||
|
||||
def _get_cached_source_id(self, source):
|
||||
### Redis: URL processed recently? -> Avoid increasing SERIAL counter & efficiency of DB
|
||||
try:
|
||||
source_id = self.redis_instance.get(source)
|
||||
if (source_id is not None):
|
||||
source_id = source_id.decode("utf-8")
|
||||
except Exception as e:
|
||||
logger.warning("Exception querying Redis: {}".format(str(e)))
|
||||
source_id = None
|
||||
return source_id
|
||||
|
||||
def _get_source_id(self, cursor, source):
|
||||
#####################
|
||||
### Get source corresponding id
|
||||
#####################
|
||||
# Cached?
|
||||
id_source = self._get_cached_source_id(source)
|
||||
if (id_source is None):
|
||||
c = cursor.execute("SELECT id FROM SOURCE WHERE source='{}'".format(source.replace("'", "''"))).fetchone()
|
||||
if (c is None) or (len(c) == 0):
|
||||
# Source does not exist, insert and get id
|
||||
c = cursor.execute("INSERT INTO SOURCE (source) VALUES ('{}') RETURNING id;".format(source.replace("'", "''"))).fetchone()
|
||||
# Decode source id
|
||||
id_source = c[0]
|
||||
# Cache
|
||||
self.redis_instance.set(source, id_source, ex=self.redis_expiry_seconds)
|
||||
return id_source
|
||||
|
||||
def _get_urls_id(self, cursor, urls_full):
|
||||
#####################
|
||||
### Get id of inserted and filtered URLs
|
||||
#####################
|
||||
# TODO: Cache url -> url_id, url_canonical
|
||||
if (len(urls_full) == 0):
|
||||
return []
|
||||
# Get inserted and filtered URL ids (unnested). Filtered URLs are also retrieved since they might have been fetched from a new source
|
||||
in_inserted_filtered_urls = "(" + ', '.join(["'" + u.replace("'", "''") + "'" for u in urls_full]) + ")"
|
||||
id_urls_related = [ i[0] for i in cursor.execute("SELECT id FROM URLS WHERE url IN {};".format(in_inserted_filtered_urls)).fetchall() ]
|
||||
return id_urls_related
|
||||
|
||||
def _insert_urls_source(self, cursor, id_urls_related, id_source):
|
||||
#####################
|
||||
### Insert URL sources: (id_url_1, id_source), (id_url_2, id_source), ...
|
||||
#####################
|
||||
if (len(id_urls_related) == 0) or (id_source is None):
|
||||
return
|
||||
columns = "(id_url, id_source)"
|
||||
insert_args = ', '.join( [ self._format([id_url, id_source]) for id_url in id_urls_related ] )
|
||||
# Insert
|
||||
sql_code = "INSERT INTO URLS_SOURCE {} VALUES {} ON CONFLICT DO NOTHING;".format(columns, insert_args)
|
||||
# logger.debug("SQL CODE: {}".format(sql_code))
|
||||
c = cursor.execute(sql_code)
|
||||
|
||||
def write_batch(self, urls_fetched, source):
|
||||
# Chunks of 50 elements
|
||||
n = 50
|
||||
# Divide in small chunks
|
||||
urls_fetched_chunks = [urls_fetched[i:i + n] for i in range(0, len(urls_fetched), n)]
|
||||
# Process
|
||||
for urls_fetched_chunk_i in urls_fetched_chunks:
|
||||
self._write_small_batch(urls_fetched_chunk_i, source)
|
||||
|
||||
def _write_small_batch(self, urls_fetched, source):
|
||||
try:
|
||||
logger.info("Fetched #{} URLs, source: {}".format(len(urls_fetched), source))
|
||||
|
||||
if (len(urls_fetched) == 0):
|
||||
logger.debug("Empty batch of urls (not writing to DB) for source: {}".format(source))
|
||||
return
|
||||
|
||||
# Shuffle URLs to reduce continuous URLs of same URL host (minimize chance of being blocked for too many continuous requests)
|
||||
random.shuffle(urls_fetched)
|
||||
|
||||
# Get list of domains to filter
|
||||
list_domains_to_filter = self._get_domains_to_filter()
|
||||
# Get list of (pattern, priority, status) tuples to override status if required
|
||||
list_pattern_status_tuple = self._get_pattern_status_list()
|
||||
# Sort pattern tuples by priority
|
||||
list_pattern_status_tuple.sort(key=lambda tup: tup[1], reverse=True)
|
||||
|
||||
# Process URLs to update DB
|
||||
list_insert_url_tuple_args, list_tuple_canonical_duplicate_urls, dict_full_urls_to_canonical = self._decode_urls(urls_fetched, list_domains_to_filter, list_pattern_status_tuple)
|
||||
# Full set of URL and its canonical form (to associate them to a search), both to insert and filter
|
||||
urls_full = set(dict_full_urls_to_canonical.keys()).union( set(dict_full_urls_to_canonical.values()) )
|
||||
|
||||
# Insert
|
||||
with psycopg.connect(self.db_connect_info) as conn:
|
||||
# Open cursor
|
||||
cursor = conn.cursor()
|
||||
# Autocommit at end of transaction (Atomic insert of URLs and sources)
|
||||
with conn.transaction() as tx:
|
||||
# Insert processed URLs
|
||||
self._insert_urls(cursor, list_insert_url_tuple_args)
|
||||
# Insert URLs duplicated (canonical != fetched url)
|
||||
self._insert_urls_duplicated(cursor, list_tuple_canonical_duplicate_urls)
|
||||
|
||||
# Get source id in DB
|
||||
id_source = self._get_source_id(cursor, source)
|
||||
# Get IDs of all related URLs
|
||||
id_urls_related = self._get_urls_id(cursor, urls_full)
|
||||
# Insert search source associated to URLs
|
||||
self._insert_urls_source(cursor, id_urls_related, id_source)
|
||||
|
||||
# Update Redis status of inserted and filtered URLs after writing to DB
|
||||
for url, url_canonical in dict_full_urls_to_canonical.items():
|
||||
try:
|
||||
# Set with updated expiry time
|
||||
self.redis_instance.set(url, url_canonical, ex=self.redis_expiry_seconds)
|
||||
if (url != url_canonical):
|
||||
self.redis_instance.set(url_canonical, url_canonical, ex=self.redis_expiry_seconds)
|
||||
except Exception as e:
|
||||
logger.warning("Exception running set in Redis: {}".format(str(e)))
|
||||
|
||||
if (len(list_insert_url_tuple_args) > 0):
|
||||
try:
|
||||
webhook_token = os.environ.get("CLIQ_WEBHOOK_TOKEN")
|
||||
endpoint_message = "https://cliq.zoho.com/api/v2/channelsbyname/urlretrievalbot/message?zapikey={}".format(webhook_token)
|
||||
|
||||
payload = json.dumps({"text": "Fetched #{} new URLs, source: {}".format(len(list_insert_url_tuple_args), source) })
|
||||
r = requests.post(endpoint_message, data=payload)
|
||||
except Exception as e:
|
||||
logger.warning("Webhook failed: {}".format(str(e)))
|
||||
|
||||
logger.debug("URL DB write finished")
|
||||
except Exception as e:
|
||||
logger.warning( "Exception writing to URL_DB:\n{}".format(traceback.format_exc()) )
|
||||
logger.debug( "Exception --- List of URLs: {}".format(str(urls_fetched)) )
|
||||
app_fetcher/src/fetcher_status.py (new file, 39 lines)
@@ -0,0 +1,39 @@
|
||||
from .db_utils import URL_DB_Writer
|
||||
import json
|
||||
import logging
|
||||
import requests
|
||||
import os
|
||||
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
|
||||
logger = logging.getLogger("news_fetcher")
|
||||
|
||||
class FetcherStatus():
|
||||
def __init__(self, db_connect_info, redis_connect_info, last_minutes_check) -> None:
|
||||
self.db_connect_info = db_connect_info
|
||||
self.db_writer = URL_DB_Writer(db_connect_info, redis_connect_info)
|
||||
self.last_minutes_check = last_minutes_check
|
||||
|
||||
def check_warning(self):
|
||||
try:
|
||||
logger.info("Starting fetcher check for last minutes {}".format(self.last_minutes_check))
|
||||
|
||||
# Get number of URLs
|
||||
num_urls = self.db_writer.get_urls_count(last_minutes_check=self.last_minutes_check)
|
||||
logger.debug("Fetched #URLs {} during the last {} minutes".format(num_urls, self.last_minutes_check))
|
||||
|
||||
webhook_token = os.environ.get("CLIQ_WEBHOOK_TOKEN")
|
||||
endpoint_message = "https://cliq.zoho.com/api/v2/channelsbyname/urlfetchwarnings/message?zapikey={}".format(webhook_token)
|
||||
|
||||
if (num_urls is None):
|
||||
try:
|
||||
payload = json.dumps({"text": "[WARNING] Error on query to DB"})
|
||||
r = requests.post(endpoint_message, data=payload)
|
||||
except Exception as e:
|
||||
logger.warning("Webhook failed: {}".format(str(e)))
|
||||
elif (num_urls == 0):
|
||||
try:
|
||||
payload = json.dumps({"text": "[WARNING] No URLs fetched for {} minutes".format(self.last_minutes_check) })
|
||||
r = requests.post(endpoint_message, data=payload)
|
||||
except Exception as e:
|
||||
logger.warning("Webhook failed: {}".format(str(e)))
|
||||
except Exception as e:
|
||||
logger.warning("Exception in UpdateErrorURLs.run(): {}".format(str(e)))
|
||||
app_fetcher/src/google_bypass.py (new file, 27 lines)
@@ -0,0 +1,27 @@
|
||||
import requests
|
||||
import json
|
||||
import logging
|
||||
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
|
||||
logger = logging.getLogger("news_fetcher")
|
||||
|
||||
class GoogleByPass():
|
||||
def __init__(self) -> None:
|
||||
pass
|
||||
|
||||
def bypass_google_urls(self, list_urls):
|
||||
if (len(list_urls) == 0):
|
||||
return []
|
||||
|
||||
try:
|
||||
# Endpoint
|
||||
gbypass_endpoint = "http://selenium_app:80/get_redirection"
|
||||
# Timeout: 20 minutes
|
||||
timeout = 60*20
|
||||
r = requests.post(gbypass_endpoint, json={"list_urls": list_urls}, timeout=timeout)
|
||||
# Decode
|
||||
list_urls_redirections = json.loads(r.text).get("list_urls_redirections", [])
|
||||
except Exception as e:
|
||||
logger.warning("Exception on request: {}. {}".format(gbypass_endpoint, str(e)))
|
||||
list_urls_redirections = []
|
||||
|
||||
return list_urls_redirections
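# Illustrative usage (hypothetical URL): the selenium_app service resolves Google News
# redirect links; the exact element format of the returned list depends on that service.
#   GoogleByPass().bypass_google_urls(["https://news.google.com/articles/XYZ"])
#   -> e.g. ["https://www.example.com/original-article"]  (empty list on error or timeout)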
|
||||
app_fetcher/src/missing_kids_status.py (new file, 69 lines)
@@ -0,0 +1,69 @@
|
||||
import requests
|
||||
from .db_utils import URL_DB_Writer
|
||||
from .url_utils import get_missing_kid_status
|
||||
import time
|
||||
import logging
|
||||
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
|
||||
logger = logging.getLogger("news_fetcher")
|
||||
|
||||
class MissingKidsStatus():
|
||||
def __init__(self, db_connect_info, redis_connect_info, num_urls) -> None:
|
||||
self.num_urls = num_urls
|
||||
self.db_connect_info = db_connect_info
|
||||
self.redis_connect_info = redis_connect_info
|
||||
self.db_writer = URL_DB_Writer(db_connect_info, redis_connect_info)
|
||||
|
||||
def update_missing_kids_status(self):
|
||||
try:
|
||||
logger.info("Starting updating status to Missing Kids URLs, limit #URLs: {}".format(self.num_urls))
|
||||
# List of URLs
|
||||
list_ids_and_urls = self.db_writer._get_missing_kids_urls(self.num_urls)
|
||||
# Dict: status -> IDs to update to new status
|
||||
dict_status_ids, dict_status_urls = {}, {}
|
||||
# Check URLs with invalid status?
|
||||
skip_invalid_check = False
|
||||
|
||||
flush_every, flush_current = 20, 0
|
||||
# Iterate URLs
|
||||
for (id, url, current_status) in list_ids_and_urls:
|
||||
# Skip duplicate URLs
|
||||
if (current_status == "duplicate"):
|
||||
continue
|
||||
# Skip invalid URLs?
|
||||
if (skip_invalid_check):
|
||||
if (current_status == "invalid"):
|
||||
continue
|
||||
|
||||
# Get status
|
||||
new_status = get_missing_kid_status(url)
|
||||
# Different? Update
|
||||
if (current_status != new_status):
|
||||
# Extend array
|
||||
dict_status_ids[new_status] = dict_status_ids.get(new_status, []) + [id]
|
||||
# Debugging dict
|
||||
dict_status_urls[new_status] = dict_status_urls.get(new_status, []) + [url]
|
||||
# +1 processed
|
||||
flush_current += 1
|
||||
|
||||
# Flush batch?
|
||||
if (flush_every == flush_current):
|
||||
logger.info("Updating status to Missing Kids URLs: {}".format(dict_status_urls))
|
||||
# Update DB
|
||||
self.db_writer._update_urls_status(dict_status_ids)
|
||||
# Reset
|
||||
flush_current = 0
|
||||
dict_status_ids, dict_status_urls = {}, {}
|
||||
|
||||
# Flush remaining batch
|
||||
if (flush_current > 0):
|
||||
logger.info("Updating status to Missing Kids URLs: {}".format(dict_status_urls))
|
||||
# Update DB
|
||||
self.db_writer._update_urls_status(dict_status_ids)
|
||||
# Reset
|
||||
flush_current = 0
|
||||
dict_status_ids, dict_status_urls = {}, {}
|
||||
|
||||
logger.info("Finished updating status to Missing Kids URLs")
|
||||
except Exception as e:
|
||||
logger.warning("Exception in MissingKidsStatus.run(): {}".format(str(e)))
|
||||
|
||||
app_fetcher/src/news_feed.py (new file, 60 lines)
@@ -0,0 +1,60 @@
|
||||
from .db_utils import URL_DB_Writer
|
||||
import feedparser
|
||||
import dateutil.parser
|
||||
import psycopg
|
||||
import logging
|
||||
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
|
||||
logger = logging.getLogger("news_fetcher")
|
||||
|
||||
class NewsFeed():
|
||||
def __init__(self, db_connect_info, redis_connect_info) -> None:
|
||||
logger.debug("Initializing News feed")
|
||||
self.db_connect_info = db_connect_info
|
||||
self.redis_connect_info = redis_connect_info
|
||||
|
||||
def _get_feed_urls(self):
|
||||
try:
|
||||
with psycopg.connect(self.db_connect_info) as conn:
|
||||
list_url_feeds = conn.execute("SELECT rss_feed FROM FEED;").fetchall()
|
||||
# Decode (tuple with 1 element)
|
||||
list_url_feeds = [l[0] for l in list_url_feeds]
|
||||
except Exception as e:
|
||||
logger.warning("Exception fetching RSS sites: " + str(e))
|
||||
list_url_feeds = []
|
||||
return list_url_feeds
|
||||
|
||||
def run(self):
|
||||
try:
|
||||
logger.debug("Starting NewsFeed.run()")
|
||||
|
||||
# Get feeds
|
||||
list_url_feeds = self._get_feed_urls()
|
||||
logger.debug("Fetching news from feeds: {}".format(str(list_url_feeds)))
|
||||
|
||||
# Process via RSS feeds
|
||||
for url_feed in list_url_feeds:
|
||||
# Initialize
|
||||
urls_fetched, urls_publish_date = [], []
|
||||
# Fetch feeds
|
||||
feeds = feedparser.parse(url_feed)
|
||||
# Parse
|
||||
for f in feeds.get("entries", []):
|
||||
# Get URL
|
||||
url = f.get("link", None)
|
||||
# Process?
|
||||
if (url is not None):
|
||||
# Available publish date?
|
||||
publish_date = f.get("published", None)
|
||||
if (publish_date is not None):
|
||||
publish_date = dateutil.parser.parse(publish_date)
|
||||
urls_publish_date.append(publish_date)
|
||||
# URL
|
||||
urls_fetched.append(url)
|
||||
|
||||
# URL fetching source
|
||||
source = "feed {}".format(url_feed)
|
||||
# Write to DB
|
||||
db_writer = URL_DB_Writer(self.db_connect_info, self.redis_connect_info)
|
||||
db_writer.write_batch(urls_fetched, source)
|
||||
except Exception as e:
|
||||
logger.warning("Exception in NewsFeed.run(): {}".format(str(e)))
|
||||
app_fetcher/src/news_missing_kids.py (new file, 40 lines)
@@ -0,0 +1,40 @@
|
||||
from .db_utils import URL_DB_Writer
|
||||
import requests
|
||||
import json
|
||||
import logging
|
||||
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
|
||||
logger = logging.getLogger("news_fetcher")
|
||||
|
||||
class NewsMissingKids():
|
||||
def __init__(self, db_connect_info, redis_connect_info, num_pages) -> None:
|
||||
logger.debug("Initializing News MissingKids")
|
||||
self.db_connect_info = db_connect_info
|
||||
self.redis_connect_info = redis_connect_info
|
||||
self.num_pages = num_pages
|
||||
|
||||
def run(self):
|
||||
try:
|
||||
logger.debug("Starting NewsMissingKids.run()")
|
||||
try:
|
||||
# Missing kids fetching endpoint, parameter number of pages to fetch
|
||||
missingkids_fetch_endpoint = "http://selenium_app:80/get_missing_kids/?pages={}".format(self.num_pages)
|
||||
# Timeout
|
||||
if (self.num_pages > 15):
|
||||
timeout = 60*90 # 1.5h
|
||||
else:
|
||||
timeout = 60*5 # 5 min
|
||||
# Request
|
||||
r = requests.get(missingkids_fetch_endpoint, timeout=timeout)
|
||||
# Decode
|
||||
urls_fetched = json.loads(r.text).get("list_urls", [])
|
||||
except Exception as e:
|
||||
logger.warning("Timeout on request: {}. {}".format(missingkids_fetch_endpoint, str(e)))
|
||||
urls_fetched = []
|
||||
|
||||
# URL fetching source
|
||||
source = "missingkids fetcher"
|
||||
# Write to DB
|
||||
db_writer = URL_DB_Writer(self.db_connect_info, self.redis_connect_info)
|
||||
db_writer.write_batch(urls_fetched, source)
|
||||
except Exception as e:
|
||||
logger.warning("Exception in NewsMissingKids.run(): {}".format(str(e)))
|
||||
app_fetcher/src/news_parsing.py (new file, 58 lines)
@@ -0,0 +1,58 @@
|
||||
from .db_utils import URL_DB_Writer
|
||||
import newspaper
|
||||
import psycopg
|
||||
import logging
|
||||
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
|
||||
logger = logging.getLogger("news_fetcher")
|
||||
|
||||
class NewsSiteParsing():
|
||||
def __init__(self, db_connect_info, redis_connect_info) -> None:
|
||||
logger.debug("Initializing News SiteParsing newspaper3k")
|
||||
self.db_connect_info = db_connect_info
|
||||
self.redis_connect_info = redis_connect_info
|
||||
|
||||
def _get_url_hosts(self):
|
||||
try:
|
||||
with psycopg.connect(self.db_connect_info) as conn:
|
||||
list_url_hosts = conn.execute("SELECT url_host FROM WEBSITE_OF_INTEREST;").fetchall()
|
||||
# Decode (tuple with 1 element)
|
||||
list_url_hosts = [l[0] for l in list_url_hosts]
|
||||
except Exception as e:
|
||||
logger.warning("Exception fetching RSS sites: " + str(e))
|
||||
list_url_hosts = []
|
||||
return list_url_hosts
|
||||
|
||||
def _postprocess(self, article_urls):
|
||||
return [url.replace("#comment-stream", "") for url in article_urls]
|
||||
|
||||
def run(self):
|
||||
try:
|
||||
logger.debug("Starting NewsSiteParsing.run() for {}")
|
||||
|
||||
# Get feeds
|
||||
list_url_hosts = self._get_url_hosts()
|
||||
logger.info("Fetching news by parsing URL hosts: {}".format(str(list_url_hosts)))
|
||||
|
||||
# Process newspaper3k build method
|
||||
for url_host_feed in list_url_hosts:
|
||||
# Protocol
|
||||
if not (url_host_feed.startswith("http")):
|
||||
url_host_feed_formatted = "https://" + url_host_feed
|
||||
else:
|
||||
url_host_feed_formatted = url_host_feed
|
||||
|
||||
logger.debug("Fetching newspaper3k parsing based on URL: {}".format(url_host_feed_formatted))
|
||||
# Source object
|
||||
url_host_built = newspaper.build(url_host_feed_formatted)
|
||||
# Get articles URL list
|
||||
urls_fetched = url_host_built.article_urls()
|
||||
# Post-processing
|
||||
urls_fetched = self._postprocess(urls_fetched)
|
||||
|
||||
# URL fetching source
|
||||
source = "newspaper3k {}".format(url_host_feed)
|
||||
# Write to DB
|
||||
db_writer = URL_DB_Writer(self.db_connect_info, self.redis_connect_info)
|
||||
db_writer.write_batch(urls_fetched, source)
|
||||
except Exception as e:
|
||||
logger.warning("Exception in NewsSiteParsing.run(): {}".format(str(e)))
|
||||
app_fetcher/src/news_search.py (new file, 181 lines)
@@ -0,0 +1,181 @@
|
||||
from .db_utils import URL_DB_Writer
|
||||
import psycopg
|
||||
from .utils import get_searxng_instances
|
||||
from .search_sources import FetcherDuckDuckGo, FetcherGNews, FetcherGoogleNews, FetcherSearxNews, FetcherPreSearch
|
||||
from threading import Thread
|
||||
import time
|
||||
import random
|
||||
import logging
|
||||
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
|
||||
logger = logging.getLogger("news_fetcher")
|
||||
|
||||
class NewsSearch():
|
||||
def __init__(self, db_connect_info, redis_connect_info, full=True) -> None:
|
||||
logger.debug("Initializing News feed")
|
||||
self.db_connect_info = db_connect_info
|
||||
self.redis_connect_info = redis_connect_info
|
||||
self.db_writer = URL_DB_Writer(db_connect_info, redis_connect_info)
|
||||
self.full_search = full
|
||||
|
||||
def _get_url_host_list(self):
|
||||
try:
|
||||
with psycopg.connect(self.db_connect_info) as conn:
|
||||
# List of URL host
|
||||
list_url_host = [l[0] for l in conn.execute("SELECT url_host FROM WEBSITE_OF_INTEREST;").fetchall()]
|
||||
# Clean http / https from URLs
|
||||
list_url_host = [l.replace("https://", "").replace("http://", "") for l in list_url_host]
|
||||
# Clean last slash if exists
|
||||
list_url_host = [ l if not l.endswith("/") else l[:-1] for l in list_url_host]
|
||||
except Exception as e:
|
||||
logger.warning("Exception fetching URL host list: " + str(e))
|
||||
list_url_host = []
|
||||
return list_url_host
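# Illustrative normalization (hypothetical rows): "https://www.unicef.org/" -> "www.unicef.org",
# "http://example.org/" -> "example.org"; scheme and trailing slash are stripped, presumably so
# the hosts can be dropped into site:-style searches.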
|
||||
|
||||
def _get_search_list(self):
|
||||
try:
|
||||
with psycopg.connect(self.db_connect_info) as conn:
|
||||
# List of keyword searches
|
||||
list_search_text = [l[0] for l in conn.execute("SELECT keyword_search FROM SEARCH;").fetchall()]
|
||||
except Exception as e:
|
||||
logger.warning("Exception fetching searches list: " + str(e))
|
||||
list_search_text = []
|
||||
return list_search_text
|
||||
|
||||
def _run_fetching(self, search_text):
|
||||
logger.debug("Starting _run_fetching() for {}".format(search_text))
|
||||
|
||||
# Initialize DB Writer
|
||||
db_writer = URL_DB_Writer(self.db_connect_info, self.redis_connect_info)
|
||||
|
||||
# Common parameters
|
||||
lang, region = "en", "US"
|
||||
|
||||
### PreSearch
|
||||
dict_params_news = {"search": search_text}
|
||||
FetcherPreSearch(**dict_params_news).fetch_articles(db_writer)
|
||||
|
||||
### DuckDuckGo
|
||||
period = "d"
|
||||
dict_params_news = {"search": search_text, "lang": "wt", "region": "wt", "search_category": "news", "period": period}
|
||||
FetcherDuckDuckGo(**dict_params_news).fetch_articles(db_writer)
|
||||
dict_params_general = {"search": search_text, "lang": "wt", "region": "wt", "search_category": "general", "period": period}
|
||||
FetcherDuckDuckGo(**dict_params_general).fetch_articles(db_writer)
|
||||
|
||||
if (self.full_search):
|
||||
# Skip site:{} searches here, due to the extra time required by the Google bypass
|
||||
if ("site:" not in search_text):
|
||||
### GNews
|
||||
dict_params = {"search": search_text, "lang": "wt", "region": "wt", "period": period}
|
||||
FetcherGNews(**dict_params).fetch_articles(db_writer)
|
||||
|
||||
### GoogleNews
|
||||
dict_params_news = {"search": search_text, "lang": lang, "region": region, "search_category": "news", "period": period}
|
||||
FetcherGoogleNews(**dict_params_news).fetch_articles(db_writer)
|
||||
# dict_params_general = {"search": search_text, "lang": lang, "region": region, "search_category": "general", "period": period}
|
||||
|
||||
|
||||
'''
|
||||
# Method run concurrently, minimize overlapping
|
||||
time.sleep(random.uniform(1, 15))
|
||||
list_threads = []
|
||||
|
||||
def run_search(FetcherObject, dict_params):
|
||||
# Initialize DB Writer
|
||||
db_writer = URL_DB_Writer(self.db_connect_info, self.redis_connect_info)
|
||||
# Fetch and write to DB
|
||||
FetcherObject(**dict_params).fetch_articles(db_writer)
|
||||
|
||||
"""
|
||||
### SearxNG
|
||||
period = "day"
|
||||
for searx_instance in get_searxng_instances():
|
||||
dict_params_news = {"search": search_text, "searx_instance": searx_instance, "lang": lang, "region": region, "search_category": "news", "period": period}
|
||||
dict_params_general = {"search": search_text, "searx_instance": searx_instance, "lang": lang, "region": region, "search_category": "general", "period": period}
|
||||
# Append thread
|
||||
list_threads.append( Thread(target=run_search, args=(FetcherSearxNews, dict_params_news, )) )
|
||||
list_threads.append( Thread(target=run_search, args=(FetcherSearxNews, dict_params_general, )) )
|
||||
"""
|
||||
|
||||
### PreSearch
|
||||
dict_params_news = {"search": search_text}
|
||||
list_threads.append( Thread(target=run_search, args=(FetcherPreSearch, dict_params_news, )) )
|
||||
|
||||
### DuckDuckGo
|
||||
period = "d"
|
||||
dict_params_news = {"search": search_text, "lang": "wt", "region": "wt", "search_category": "news", "period": period}
|
||||
dict_params_general = {"search": search_text, "lang": "wt", "region": "wt", "search_category": "general", "period": period}
|
||||
# Append thread
|
||||
list_threads.append( Thread(target=run_search, args=(FetcherDuckDuckGo, dict_params_news, )) )
|
||||
list_threads.append( Thread(target=run_search, args=(FetcherDuckDuckGo, dict_params_general, )) )
|
||||
|
||||
if (self.full_search):
|
||||
# Avoid site:{} search due to G-Bypass required time
|
||||
if ("site:" not in search_text):
|
||||
### GNews
|
||||
for period in ["1d"]: # ["1d", "6h"]:
|
||||
dict_params = {"search": search_text, "lang": "wt", "region": "wt", "period": period}
|
||||
# Append thread
|
||||
list_threads.append( Thread(target=run_search, args=(FetcherGNews, dict_params, )) )
|
||||
|
||||
### GoogleNews
|
||||
for period in ["1d"]: # ["1d", "6h"]:
|
||||
# News
|
||||
dict_params_news = {"search": search_text, "lang": lang, "region": region, "search_category": "news", "period": period}
|
||||
list_threads.append( Thread(target=run_search, args=(FetcherGoogleNews, dict_params_news, )) )
|
||||
if False:
|
||||
dict_params_general = {"search": search_text, "lang": lang, "region": region, "search_category": "general", "period": period}
|
||||
list_threads.append( Thread(target=run_search, args=(FetcherGoogleNews, dict_params_general, )) )
|
||||
|
||||
# Run
|
||||
MULTITHREADED = False
|
||||
logger.debug("Fetching threads starting")
|
||||
if MULTITHREADED:
|
||||
for t in list_threads:
|
||||
t.start()
|
||||
# Join
|
||||
for t in list_threads:
|
||||
t.join()
|
||||
else:
|
||||
for t in list_threads:
|
||||
t.start()
|
||||
t.join()
|
||||
logger.debug("Fetching threads finished")
|
||||
'''
|
||||
logger.debug("Finished _run_fetching()")
|
||||
|
||||
def run(self):
|
||||
try:
|
||||
logger.info("Fetching text searches & URL hosts of interest")
|
||||
|
||||
# Get text searches of interest
|
||||
list_search_text_of_interest = self._get_search_list()
|
||||
|
||||
# Get URL host of interest
|
||||
list_url_host = self._get_url_host_list()
|
||||
# Get text searches for URL hosts
|
||||
list_search_text_url_host = ["site:{}".format(l) for l in list_url_host]
|
||||
|
||||
MULTITHREADED = False
|
||||
if MULTITHREADED:
|
||||
# Run fetching
|
||||
list_fetching_threads = []
|
||||
for search_text in list_search_text_of_interest + list_search_text_url_host:
|
||||
logger.debug("Fetching news for search: {}".format(search_text))
|
||||
# Append thread
|
||||
list_fetching_threads.append( Thread(target=self._run_fetching, args=(search_text, )) )
|
||||
|
||||
# Run
|
||||
for t in list_fetching_threads:
|
||||
t.start()
|
||||
# Join
|
||||
for t in list_fetching_threads:
|
||||
t.join()
|
||||
else:
|
||||
for search_text in list_search_text_of_interest + list_search_text_url_host:
|
||||
logger.debug("Fetching news for search: {}".format(search_text))
|
||||
self._run_fetching(search_text)
|
||||
|
||||
logger.info("Finished fetching text searches & URL hosts of interest")
|
||||
except Exception as e:
|
||||
logger.warning("Exception in NewsSearch.run(): {}".format(str(e)))
|
||||
|
||||
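A minimal driver for this module would construct NewsSearch with the connection settings and call run(). The sketch below uses placeholder values rather than the project's actual configuration:

# Hypothetical usage sketch; connection values are placeholders.
if __name__ == "__main__":
    db_connect_info = "host=localhost port=5432 user=user password=secret dbname=matitos"
    redis_connect_info = {"host": "localhost", "port": 6379}  # assumed shape
    # full=False restricts _run_fetching() to PreSearch and DuckDuckGo
    NewsSearch(db_connect_info, redis_connect_info, full=False).run()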
321
app_fetcher/src/search_sources.py
Normal file
@@ -0,0 +1,321 @@
|
||||
from duckduckgo_search import DDGS
|
||||
from gnews import GNews
|
||||
from GoogleNews import GoogleNews
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
import os
|
||||
import time
|
||||
import json
|
||||
import numpy as np
|
||||
import random
|
||||
from .user_agents import user_agents_list
|
||||
from .google_bypass import GoogleByPass
|
||||
from abc import ABC, abstractmethod
|
||||
import logging
|
||||
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
|
||||
logger = logging.getLogger("news_fetcher")
|
||||
|
||||
|
||||
|
||||
# Generic fetcher (fetches articles, writes to DB)
|
||||
class FetcherAbstract(ABC):
|
||||
@abstractmethod
|
||||
def _fetch(self):
|
||||
pass
|
||||
|
||||
def fetch_articles(self, db_writer):
|
||||
logger.debug("Starting fetch() for {}".format(self.name))
|
||||
# Fetch articles
|
||||
list_news = self._fetch()
|
||||
logger.info("Found #{} articles for search: {}".format(len(list_news), self.name))
|
||||
# Write to DB
|
||||
db_writer.write_batch(list_news, self.name)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
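# --- Illustration only: this sketch is not part of the module. ---
# Every concrete fetcher below implements _fetch() to return a list of article URLs,
# while FetcherAbstract.fetch_articles() handles logging and the DB write.
# A hypothetical minimal subclass (the feed URL and the feedparser dependency are assumptions):
import feedparser

class FetcherExampleRSS(FetcherAbstract):
    def __init__(self, feed_url):
        self.feed_url = feed_url
        self.name = "example_rss {}".format(feed_url)

    def _fetch(self):
        try:
            # Plain list of article URLs; the base class writes them to the DB
            return [entry.link for entry in feedparser.parse(self.feed_url).entries]
        except Exception as e:
            logger.warning("Exception fetching {}: {}".format(self.name, str(e)))
            return []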
class FetcherPreSearch(FetcherAbstract):
|
||||
def __init__(self, search):
|
||||
"""
|
||||
# period ->
|
||||
- h = hours (eg: 12h)
|
||||
- d = days (eg: 7d)
|
||||
- m = months (eg: 6m)
|
||||
- y = years (eg: 1y)
|
||||
"""
|
||||
self.search = search
|
||||
self.period = "1d" # TODO Fixed for the moment
|
||||
# self.lang = lang
|
||||
# self.region = region
|
||||
search_category = "news"
|
||||
self.name = "presearch {} {} {}".format(search, search_category, self.period)
|
||||
|
||||
def _fetch(self):
|
||||
try:
|
||||
# PreSearch fetching endpoint, parameter search keyword
|
||||
presearch_fetch_endpoint = "http://selenium_app:80/fetch_presearch/?search_keyword={}".format(self.search)
|
||||
# Timeout: 15 minutes
|
||||
r = requests.get(presearch_fetch_endpoint, timeout=900)
|
||||
# Decode
|
||||
list_news = json.loads(r.text).get("list_urls", [])
|
||||
except Exception as e:
|
||||
logger.warning("Timeout on request: {}. {}".format(presearch_fetch_endpoint, str(e)))
|
||||
list_news = []
|
||||
return list_news
|
||||
|
||||
|
||||
|
||||
class FetcherGNews(FetcherAbstract):
|
||||
def __init__(self, search, period, lang="en", region="US"):
|
||||
"""
|
||||
# period ->
|
||||
- h = hours (eg: 12h)
|
||||
- d = days (eg: 7d)
|
||||
- m = months (eg: 6m)
|
||||
- y = years (eg: 1y)
|
||||
"""
|
||||
self.search = search
|
||||
self.period = period
|
||||
self.lang = lang
|
||||
self.region = region
|
||||
search_category = "news"
|
||||
self.name = "gnews {} {} {} {}".format(search, search_category, period, "{}-{}".format(lang, region))
|
||||
|
||||
def _fetch(self):
|
||||
try:
|
||||
list_dict_news = GNews(self.lang, self.region, period=self.period).get_news(self.search)
|
||||
# Decode
|
||||
list_news = []
|
||||
for l in list_dict_news:
|
||||
list_news.append(l.get("url"))
|
||||
except Exception as e:
|
||||
logger.warning("Exception fetching {}: {}".format(self.name, str(e)))
|
||||
list_news = []
|
||||
|
||||
# Bypass Google links
|
||||
list_news_redirections = GoogleByPass().bypass_google_urls(list_news)
|
||||
|
||||
return list_news_redirections
|
||||
|
||||
class FetcherGoogleNews(FetcherAbstract):
|
||||
def __init__(self, search, search_category="news", period="1d", lang="en", region="US"):
|
||||
assert(search_category in ["news", "general"])
|
||||
|
||||
self.lang = lang
|
||||
self.region = region
|
||||
self.period = period
|
||||
self.search_category = search_category
|
||||
self.search = search
|
||||
self.name = "googlenews {} {} {} {}".format(search, search_category, period, "{}-{}".format(lang, region))
|
||||
|
||||
def _fetch(self):
|
||||
try:
|
||||
# Initialize
|
||||
g = GoogleNews(encode="utf-8", period=self.period, lang=self.lang, region=self.region)
|
||||
g.enableException(True)
|
||||
|
||||
if (self.search_category == "general"):
|
||||
set_links = set()
|
||||
# Search
|
||||
g.search(self.search)
|
||||
|
||||
# Iterate pages
|
||||
MAX_ITER_PAGES = 15
|
||||
for i in range(MAX_ITER_PAGES):
|
||||
time.sleep(random.uniform(1, 1.5))
|
||||
num_before = len(set_links)
|
||||
|
||||
# Get page
|
||||
try:
|
||||
links = g.page_at(i)
|
||||
except Exception as e:
|
||||
logger.warning("Exception fetching page in GoogleNews {}: {}".format(self.name, str(e)))
|
||||
break
|
||||
# Links
|
||||
for l in links:
|
||||
# '/url?esrc=s&q=&rct=j&sa=U&url=https://www.breitbart.com/news/scent-of-luxury-indias-jasmine-infuses-global-perfume/&ved=2ahUKEwjOybGSiN-AAxX1gv0HHfqSBpMQxfQBegQICBAC&usg=AOvVaw06GdoHyzPbIopUaEuUSQPQ'
|
||||
url = l.get("link").split("url=")[-1]
|
||||
set_links.add(url)
|
||||
|
||||
num_after = len(set_links)
|
||||
|
||||
# Finished?
|
||||
if (num_before == num_after):
|
||||
logger.debug("Iterated {} pages on GoogleNews general search".format(i))
|
||||
break
|
||||
# To list
|
||||
list_news = list(set_links)
|
||||
elif (self.search_category == "news"):
|
||||
# Search
|
||||
g.get_news(self.search)
|
||||
# Fetch
|
||||
list_news = g.get_links()
|
||||
|
||||
except Exception as e:
|
||||
logger.warning("Exception fetching {}: {}".format(self.name, str(e)))
|
||||
list_news = []
|
||||
|
||||
# Bypass Google links
|
||||
list_news_redirections = GoogleByPass().bypass_google_urls(list_news)
|
||||
|
||||
return list_news_redirections
|
||||
|
||||
class FetcherDuckDuckGo(FetcherAbstract):
|
||||
def __init__(self, search, search_category, period, lang="wt", region="wt"):
|
||||
assert(search_category in ["news", "general"])
|
||||
assert(period in ["d", "w", "m", "y"])
|
||||
self.search = search
|
||||
self.search_category = search_category
|
||||
self.period = period
|
||||
self.lang_region = "{}-{}".format(lang, region)
|
||||
self.name = "duckduckgo {} {} {} {}".format(search, search_category, "1{}".format(period), region)
|
||||
|
||||
def _fetch(self):
|
||||
try:
|
||||
list_news = []
|
||||
with DDGS(timeout=10) as ddgs:
|
||||
if (self.search_category == "general"):
|
||||
generator_links = ddgs.text(keywords=self.search, timelimit=self.period, region=self.lang_region)
|
||||
elif (self.search_category == "news"):
|
||||
generator_links = ddgs.news(keywords=self.search, timelimit=self.period, region=self.lang_region)
|
||||
|
||||
for l in generator_links:
|
||||
list_news.append( l.get("url", l.get("href")) )
|
||||
|
||||
except Exception as e:
|
||||
logger.warning("Exception fetching {}: {}".format(self.name, str(e)))
|
||||
list_news = []
|
||||
return list_news
|
||||
|
||||
|
||||
class FetcherSearxNews(FetcherAbstract):
|
||||
def __init__(self, search="child abuse", searx_instance="https://serx.ml/", lang="en", region="US", search_category="news", period="day"):
|
||||
assert(search_category in ["news", "general"])
|
||||
assert(period in [None, "day", "week", "month", "year"])
|
||||
# Random header (minimize probability of web-scraping detection)
|
||||
self.headers = {
|
||||
'User-agent': str(np.random.choice(user_agents_list)),
|
||||
'Accept-Encoding': 'gzip, deflate',
|
||||
'Accept': '*/*',
|
||||
'Connection': 'keep-alive',
|
||||
}
|
||||
""" # Optional header
|
||||
self.headers = {
|
||||
'User-agent': str(np.random.choice(user_agents_list)),
|
||||
'Accept-Encoding': 'gzip, deflate, br',
|
||||
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
|
||||
'Connection': 'keep-alive',
|
||||
'Upgrade-Insecure-Requests': '1',
|
||||
'TE': 'trailers',
|
||||
'Sec-Fetch-Site': 'cross-site',
|
||||
'Sec-Fetch-Mode': 'navigate',
|
||||
'Sec-Fetch-Dest': 'document',
|
||||
}
|
||||
"""
|
||||
self.search = search
|
||||
self.searx_instance = searx_instance
|
||||
self.lang_region = "{}-{}".format(lang, region)
|
||||
self.search_category = search_category
|
||||
self.period = period
|
||||
self.t_sleep_lower, self.t_sleep_higher = 0.5, 1.5
|
||||
self.request_timeout = 240
|
||||
|
||||
period_name_mapping = {
|
||||
None: "no_date_range",
|
||||
"day": "1d",
|
||||
"week": "1w",
|
||||
"month": "1m",
|
||||
"year": "1y",
|
||||
}
|
||||
self.name = "searxng {} {} {} {} {}".format(searx_instance.replace("https://", "").replace("/", ""), search, search_category, period_name_mapping[period], self.lang_region)
|
||||
logger.info("SearX - Initialized SearX fetcher: {}".format(self.name))
|
||||
|
||||
def _request_and_decode(self, url_search):
|
||||
# Initial random time sleep (minimize chance of getting blocked)
|
||||
time.sleep(random.uniform(self.t_sleep_lower, self.t_sleep_higher))
|
||||
# Request
|
||||
logger.debug("SearX - Searching: {}".format(url_search))
|
||||
try:
|
||||
r = requests.get(url_search, headers=self.headers, timeout=self.request_timeout)
|
||||
except Exception as e:
|
||||
logger.warning("SearX - Exception in request: {}".format(url_search), "\n", str(e))
|
||||
return []
|
||||
|
||||
if (r.status_code == 200):
|
||||
# Status code Ok
|
||||
pass
|
||||
elif (r.status_code == 429):
|
||||
# TooManyRequests, "Rate limit exceeded"
|
||||
logger.warning("SearX {} - Too many requests while running: {}. Request output: {}".format(self.name, r.url, r.text))
|
||||
return []
|
||||
else:
# Any other non-OK status code
logger.warning("SearX {} - Status code: {}. Request output: {}".format(self.name, r.status_code, r.text))
return []
|
||||
|
||||
# Decode request
|
||||
soup = BeautifulSoup(r.text, 'html.parser')
|
||||
page_url_set = set()
|
||||
# h3 links
|
||||
for elem in soup.find_all('h3'):
|
||||
# Get url
|
||||
url = elem.find('a').get('href')
|
||||
page_url_set.add(url)
|
||||
return page_url_set
|
||||
|
||||
def _get_news_list(self):
|
||||
############################################################
|
||||
# Domain & search parameter
|
||||
search_domain = os.path.join(self.searx_instance, "search?q=")
|
||||
# Search keywords
|
||||
search_formatted = self.search.replace(" ", "+").replace(":", "%3A")
|
||||
# Period formatted
|
||||
period_formatted = "&time_range={}".format(self.period) if self.period is not None else ""
|
||||
# Search parameters
|
||||
search_parameters = "&category_{}=on&language={}{}".format(self.search_category, self.lang_region, period_formatted)
|
||||
# Combined url search
|
||||
url_search_nopage = "{}{}{}".format(search_domain, search_formatted, search_parameters)
|
||||
############################################################
|
||||
|
||||
# Request and decode on page=1
|
||||
url_set = self._request_and_decode(url_search_nopage)
|
||||
# No results?
|
||||
if (len(url_set) == 0):
|
||||
logger.warning("SearX {} - Empty results on search: {}".format(self.name, url_search_nopage))
|
||||
return []
|
||||
|
||||
# Iterate pages
|
||||
search_numpage = 2
|
||||
while True:
|
||||
# Combine url search with page number
|
||||
url_search_with_page = "{}&pageno={}".format(url_search_nopage, search_numpage)
|
||||
# Request and decode on page=X
|
||||
url_set_i = self._request_and_decode(url_search_with_page)
|
||||
|
||||
# Length before merging
|
||||
length_current = len(url_set)
|
||||
# Merge
|
||||
url_set = url_set.union(url_set_i)
|
||||
# Length after merging
|
||||
length_merged = len(url_set)
|
||||
|
||||
# No new elements?
|
||||
if (length_current == length_merged):
|
||||
logger.debug("SearX {} - Finished processing search, #pages: {}".format(self.name, search_numpage))
|
||||
break
|
||||
# Next page
|
||||
search_numpage += 1
|
||||
|
||||
return list(url_set)
|
||||
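# For reference (illustrative values only, not project defaults): with
# search="child abuse", search_category="news", language "en-US",
# period="day" and instance "https://searx.be/", the URL built above is
#   https://searx.be/search?q=child+abuse&category_news=on&language=en-US&time_range=day
# Subsequent requests append &pageno=2, &pageno=3, ... until a page adds no new links.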
|
||||
def _fetch(self):
|
||||
try:
|
||||
# Fetch news
|
||||
list_news = self._get_news_list()
|
||||
except Exception as e:
|
||||
logger.warning("Exception fetching {}: {}".format(self.name, str(e)))
|
||||
list_news = []
|
||||
return list_news
|
||||
63
app_fetcher/src/url_status.py
Normal file
@@ -0,0 +1,63 @@
|
||||
from .db_utils import URL_DB_Writer
|
||||
from .url_utils import process_article
|
||||
import logging
|
||||
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
|
||||
logger = logging.getLogger("news_fetcher")
|
||||
|
||||
class UpdateErrorURLs():
|
||||
def __init__(self, db_connect_info, redis_connect_info, num_urls) -> None:
|
||||
self.num_urls = num_urls
|
||||
self.db_connect_info = db_connect_info
|
||||
self.redis_connect_info = redis_connect_info
|
||||
self.db_writer = URL_DB_Writer(db_connect_info, redis_connect_info)
|
||||
|
||||
def update_error_urls_status(self):
|
||||
try:
|
||||
logger.info("Starting updating status to URLs with error, limit #URLs: {}".format(self.num_urls))
|
||||
# List of URLs with status 'error'
|
||||
list_ids_and_urls = self.db_writer._get_error_urls(self.num_urls)
|
||||
# Current status
|
||||
current_status = "error"
|
||||
# Dict: status -> IDs to update to new status
|
||||
dict_status_ids, dict_status_urls = {}, {}
|
||||
|
||||
# Get list of (pattern, priority, status) tuples to override status if required
|
||||
list_pattern_status_tuple = self.db_writer._get_pattern_status_list()
|
||||
# Sort pattern tuples by priority
|
||||
list_pattern_status_tuple.sort(key=lambda tup: tup[1], reverse=True)
|
||||
|
||||
flush_every, flush_current = 20, 0
|
||||
# Iterate URLs
|
||||
for (id, url) in list_ids_and_urls:
|
||||
# Get status
|
||||
url_canonical, article_elements, new_status = process_article(url, list_pattern_status_tuple)
|
||||
# Different? Update
|
||||
if (current_status != new_status):
|
||||
# Extend array
|
||||
dict_status_ids[new_status] = dict_status_ids.get(new_status, []) + [id]
|
||||
# Debugging dict
|
||||
dict_status_urls[new_status] = dict_status_urls.get(new_status, []) + [url]
|
||||
# +1 processed
|
||||
flush_current += 1
|
||||
|
||||
# Flush batch?
|
||||
if (flush_every == flush_current):
|
||||
logger.info("Updating status to URLs with error: {}".format(dict_status_urls))
|
||||
# Update DB
|
||||
self.db_writer._update_urls_status(dict_status_ids)
|
||||
# Reset
|
||||
flush_current = 0
|
||||
dict_status_ids, dict_status_urls = {}, {}
|
||||
|
||||
# Flush remaining batch
|
||||
if (flush_current > 0):
|
||||
logger.info("Updating status to URLs with error: {}".format(dict_status_urls))
|
||||
# Update DB
|
||||
self.db_writer._update_urls_status(dict_status_ids)
|
||||
# Reset
|
||||
flush_current = 0
|
||||
dict_status_ids, dict_status_urls = {}, {}
|
||||
|
||||
logger.info("Finished updating status to URLs with error")
|
||||
except Exception as e:
|
||||
logger.warning("Exception in UpdateErrorURLs.run(): {}".format(str(e)))
|
||||
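Similarly, a hedged driver sketch for UpdateErrorURLs; the connection values are placeholders:

# Hypothetical usage sketch; connection values are placeholders.
db_connect_info = "host=localhost port=5432 user=user password=secret dbname=matitos"
redis_connect_info = {"host": "localhost", "port": 6379}  # assumed shape
UpdateErrorURLs(db_connect_info, redis_connect_info, num_urls=100).update_error_urls_status()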
289
app_fetcher/src/url_utils.py
Normal file
@@ -0,0 +1,289 @@
|
||||
from gnews import GNews
|
||||
import dateutil.parser
|
||||
from datetime import datetime, timedelta
|
||||
from .utils import remove_http_s
|
||||
import time
|
||||
import random
|
||||
import traceback
|
||||
import requests
|
||||
import json
|
||||
import re
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
import logging
|
||||
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
|
||||
logger = logging.getLogger("news_fetcher")
|
||||
|
||||
def get_published_date(article):
|
||||
try:
|
||||
"""
|
||||
# Already fetched publish date information?
|
||||
if (publish_date_ is not None):
|
||||
return publish_date_
|
||||
"""
|
||||
|
||||
# List of potential publish dates
|
||||
potential_dates = []
|
||||
# Publish date is the best match
|
||||
potential_dates.append(article.publish_date)
|
||||
# Publish date metadata is the following best match
|
||||
potential_dates.append(article.meta_data.get('article', {}).get("published_time", None))
|
||||
# Iterate remaining keys
|
||||
for key in article.meta_data.keys():
|
||||
if ("date" in key):
|
||||
potential_dates.append(article.meta_data[key])
|
||||
|
||||
def invalid_date(p_date):
|
||||
# Today + 2 days, article from the future?
|
||||
today_plus_two = datetime.utcnow() + timedelta(days=2)
|
||||
# Article from the future?
|
||||
return p_date.timestamp() > today_plus_two.timestamp()
|
||||
|
||||
for date_ in potential_dates:
|
||||
# String date? parse
|
||||
if (type(date_) == str):
|
||||
try:
|
||||
date_ = dateutil.parser.parse(date_)
|
||||
except Exception as e:
|
||||
logger.info("Invalid date found while parsing potential date: {} for URL: {}".format(date_, article.url))
|
||||
date_ = None
|
||||
# Valid?
|
||||
if (date_ is not None) and (not invalid_date(date_)):
|
||||
return date_
|
||||
|
||||
logger.debug("Article with no published date: {}".format(article.url))
|
||||
return None
|
||||
except Exception as e:
|
||||
logger.info("Error while retrieving published date for URL: {}".format(article.url))
|
||||
return None
|
||||
|
||||
def get_url_host(article_source_url, url):
|
||||
# https://www.blabla.com/blabla -> www.blabla.com
|
||||
if (article_source_url != ""):
|
||||
# Article source URL already extracted, keep its path if any
|
||||
return remove_http_s(article_source_url) # .split("/")[0]
|
||||
else:
|
||||
return remove_http_s(url).split("/")[0]
|
||||
|
||||
def get_status_pattern_matching(url, article_status, list_pattern_status_tuple):
|
||||
# Regex pattern to update status on "valid", "invalid", and "unknown" status only
|
||||
# Status "raw", "duplicated" and "error" should remain the way they are
|
||||
# Assumption: List of patterns sorted by importance
|
||||
if (article_status in ["valid", "invalid", "unknown"]):
|
||||
# Regular expression pattern matching: https://regexr.com/
|
||||
for regex_pattern, regex_priority, status_if_match in list_pattern_status_tuple:
|
||||
# Matching?
|
||||
matching = bool(re.match(regex_pattern, url))
|
||||
# Update article status
|
||||
if (matching):
|
||||
if (status_if_match != article_status):
|
||||
logger.debug("Regex pattern found, updating status from '{}' to '{}' for URL: {}".format(article_status, status_if_match, url))
|
||||
return status_if_match
|
||||
# Pattern matching not required or no match found, keep the original article status
|
||||
return article_status
|
||||
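# Sketch of how the (pattern, priority, status) tuples drive the override above.
# The patterns are illustrative; the real tuples come from
# URL_DB_Writer._get_pattern_status_list() and are sorted by priority by the caller.
example_pattern_status_tuples = [
    (r"^https://www\.example\.com/tag/.*", 10, "invalid"),  # tag pages are not articles
    (r"^https://www\.example\.com/news/.*", 5, "valid"),    # news section is trusted
]
# get_status_pattern_matching("https://www.example.com/tag/crime", "unknown", example_pattern_status_tuples)
# returns "invalid", since the highest-priority matching pattern wins.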
|
||||
def get_missing_kid_status(url, return_canonical_url=False):
|
||||
# Sleep
|
||||
time.sleep(0.75)
|
||||
try:
|
||||
# Request
|
||||
r = requests.get(url, timeout=300)
|
||||
# Decode
|
||||
status_code = r.status_code
|
||||
# Final URL after redirects, used as the canonical URL
|
||||
url_canonical = r.url
|
||||
except Exception as e:
|
||||
logger.warning("Exception on get URL status request: {}. {}".format(url, str(e)))
|
||||
status_code = None
|
||||
url_canonical = url
|
||||
|
||||
if (status_code == 200):
|
||||
status = "valid"
|
||||
elif (status_code == 404):
|
||||
status = "invalid"
|
||||
else:
|
||||
status = "unknown"
|
||||
|
||||
logger.debug("Missing Kid URL {} status: {}".format(url, status))
|
||||
if (return_canonical_url):
|
||||
return status, url_canonical
|
||||
else:
|
||||
return status
|
||||
|
||||
def bypass_google_link(article_url):
|
||||
|
||||
def bypass_google_consent(article_url):
|
||||
# Sample URL: https://consent.google.com/m?continue=https://news.google.com/rss/articles/CBMiMGh0dHBzOi8vd3d3Lm1pc3NpbmdraWRzLm9yZy9wb3N0ZXIvbmNtYy84NjAxMTkvMdIBAA?oc%3D5&gl=NL&m=0&pc=n&cm=2&hl=en-US&src=1
|
||||
article_url_no_consent = article_url.replace("https://consent.google.com/m?continue=", "")
|
||||
|
||||
# https://stackoverflow.com/questions/76063646/how-can-i-have-redirection-link-from-google-news-link-using-requests
|
||||
headers = {
|
||||
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
|
||||
}
|
||||
cookies = {'CONSENT': 'YES+cb.20220419-08-p0.cs+FX+111'}
|
||||
|
||||
try:
|
||||
# Request
|
||||
r = requests.get(article_url_no_consent, headers=headers, cookies=cookies, timeout=300)
|
||||
# Decode
|
||||
soup = BeautifulSoup(r.text, 'html.parser')
|
||||
url_of_interest = soup.a['href']
|
||||
except Exception as e:
|
||||
logger.warning("Exception on request trying to G_bypass with headers: {}. {}".format(article_url_no_consent, str(e)))
|
||||
url_of_interest = None
|
||||
|
||||
# Not able to bypass?
|
||||
if (url_of_interest == "") or ("support.google.com" in url_of_interest) or ("news.google.com" in url_of_interest):
|
||||
url_of_interest = None
|
||||
return url_of_interest
|
||||
|
||||
def bypass_google_using_service(article_url):
|
||||
try:
|
||||
# e.g.: url = "https://news.google.com/articles/CBMiX2h0dHBzOi8vd3d3LmZveGJ1c2luZXNzLmNvbS9wb2xpdGljcy9kaXNuZXktc3Vlcy1mbG9yaWRhLWdvdi1yb24tZGVzYW50aXMtbG9zcy1zcGVjaWFsLWRpc3RyaWN00gEA?hl=en-US&gl=US&ceid=US%3Aen"
|
||||
gbypass_endpoint = "http://selenium_app:80/get_redirection"
|
||||
# Timeout: 5 minutes
|
||||
r = requests.post(gbypass_endpoint, json={"url": article_url}, timeout=300)
|
||||
# Decode
|
||||
redirect_url = json.loads(r.text).get("redirect_url", "")
|
||||
except Exception as e:
|
||||
logger.warning("Exception on request: {}. {}".format(gbypass_endpoint, str(e)))
|
||||
redirect_url = ""
|
||||
|
||||
return redirect_url
|
||||
|
||||
logger.debug("Starting gbypass_endpoint()")
|
||||
|
||||
article_url_bypassed = None
|
||||
# Bypass using request
|
||||
if ("consent.google.com" in article_url):
|
||||
article_url_bypassed = bypass_google_consent(article_url)
|
||||
# Not bypassed yet? Bypass using service
|
||||
if (article_url_bypassed is None):
|
||||
article_url_bypassed = bypass_google_using_service(article_url)
|
||||
|
||||
# if (article_url_bypassed is None) or (article_url_bypassed == "") or ("news.google.com" in article_url_bypassed):
|
||||
if (article_url_bypassed == "") or (article_url_bypassed is None):
|
||||
# Empty URL returned by Gbypass
|
||||
logger.warning("Error while bypassing Gnews for URL: {}".format(article_url))
|
||||
return None
|
||||
else:
|
||||
logger.debug("Correctly bypassed GNews to URL_redirect, from URL: {} {}".format(article_url_bypassed, article_url))
|
||||
return article_url_bypassed
|
||||
|
||||
def process_article(article_url, list_pattern_status_tuple, language="en"):
|
||||
# TODO:
|
||||
"""
|
||||
https://github.com/fhamborg/news-please
|
||||
https://github.com/fhamborg/Giveme5W1H
|
||||
https://github.com/santhoshse7en/news-fetch
|
||||
"""
|
||||
try:
|
||||
logger.debug("Starting process_article()")
|
||||
|
||||
if ("news.google.com" in article_url) or ("consent.google.com" in article_url):
|
||||
# Bypass to get redirection
|
||||
article_url = bypass_google_link(article_url)
|
||||
# Error?
|
||||
if (article_url is None):
|
||||
return None, {}, "error"
|
||||
elif ("missingkids.org/poster" in article_url):
|
||||
# Get status
|
||||
article_status, url_canonical = get_missing_kid_status(article_url, return_canonical_url=True)
|
||||
article_elements = {
|
||||
"url_full": article_url,
|
||||
"url_canonical": url_canonical
|
||||
}
|
||||
return url_canonical, article_elements, article_status
|
||||
else:
|
||||
# Avoid Too many requests (feeds, ...)
|
||||
time.sleep(0.75)
|
||||
|
||||
logger.debug("Processing: {}".format(article_url))
|
||||
|
||||
# Default status unless something happens
|
||||
article_status = "valid"
|
||||
|
||||
# Parse article
|
||||
# TODO: :param proxy: The proxy parameter is a dictionary with a single key-value pair. self._proxy = {'http': proxy, 'https': proxy} if proxy else None
|
||||
# TODO: Language per config
|
||||
article = GNews(language).get_full_article(url=article_url)
|
||||
|
||||
# Article parsed?
|
||||
if (article is None) or (not article.is_parsed):
|
||||
logger.debug("Article not parsed: {}".format(article_url))
|
||||
return article_url, {}, "error"
|
||||
|
||||
# Canonical link as main URL
|
||||
url_canonical = article.canonical_link
|
||||
# Empty canonical URL?
|
||||
if (article.canonical_link is None) or (article.canonical_link == ""):
|
||||
# URL with parameters? e.g. some zerohedge news fetched from newspaper3k end with #comment-stream -> Remove extra parameter in link
|
||||
if ("?" in article.url) or (article.url.endswith("#comment-stream")) or (article.url.endswith("#disqus_thread")):
|
||||
logger.debug("Article URL contains parameters, trying to clean URL: {}".format(article.url))
|
||||
try:
|
||||
# Remove text after parameter call
|
||||
url = article.url.split("?")[0]
|
||||
# Remove comment-stream
|
||||
url = url.replace("#comment-stream", "").replace("#disqus_thread", "")
|
||||
# Article
|
||||
article_attempt = GNews(language).get_full_article(url=url)
|
||||
# Retrieving same title? Update article based on clean URL
|
||||
if (article_attempt is not None) and (article_attempt.title == article.title):
|
||||
article = article_attempt
|
||||
except Exception as e:
|
||||
logger.info("Article parsing of URL without parameters failed: {}".format(article.url))
|
||||
else: # Default behaviour
|
||||
logger.debug("Article canonical link is empty, assuming URL=URL_CANONICAL: {}".format(article.url))
|
||||
|
||||
# By default, URL same as canonical
|
||||
url_canonical = article.url
|
||||
|
||||
elif (article.url != article.canonical_link):
|
||||
# If different, stick to canonical URL
|
||||
logger.debug("Article URL and canonical link are different: {} {}".format(article.url, article.canonical_link))
|
||||
else:
|
||||
# If same, continue...
|
||||
pass
|
||||
|
||||
# Update config to determine if content is valid
|
||||
article.config.MIN_WORD_COUNT = 150
|
||||
article.config.MIN_SENT_COUNT = 6
|
||||
|
||||
# Valid URL?
|
||||
if (not article.is_valid_url()):
|
||||
logger.debug("Not a valid news article: {}".format(url_canonical))
|
||||
article_status = "invalid"
|
||||
# Is the article's body text long enough to meet standard article requirements?
|
||||
if (not article.is_valid_body()):
|
||||
logger.debug("Article body not valid: {}".format(url_canonical))
|
||||
article_status = "unknown"
|
||||
|
||||
if (article.images != article.imgs):
|
||||
logger.debug("Article images and imgs are different: {} {}".format(article.images, article.imgs))
|
||||
|
||||
# article.keywords, article.meta_keywords, article.summary
|
||||
# article.movies
|
||||
# article.top_image
|
||||
|
||||
# Check if article status needs to be updated
|
||||
article_status = get_status_pattern_matching(url_canonical, article_status, list_pattern_status_tuple)
|
||||
|
||||
article_elements = {
|
||||
'url_full': article.url, # https://www.breitbart.com/tech/2022/10/03/report-election-integrity-project-worked-with-feds-to-censor-news-sites-in-2020/
|
||||
'url_host': get_url_host(article.source_url, url_canonical), # www.breitbart.com
|
||||
'title': article.title, # Report: ‘Election Integrity’ Partnership Worked with Feds to Censor News Sites in 2020
|
||||
'description': article.meta_description, # Coalition committed to respond in ‘early 2022’ but failed to do so, while Labor has not issued a full response since taking office
|
||||
'text': article.text, # ${Article content}
|
||||
'published_date': get_published_date(article), # python.datetime format, obtained from "YYYY-MM-DD" or '2022-10-03T20:54:17+00:00'
|
||||
'authors': article.authors, # ['Christopher Knaus']
|
||||
'language': article.meta_lang, # en
|
||||
'tags': list(article.tags), # ['Wide Open Border', '’My Son Hunter’ Movie', ...]
|
||||
'images': list(article.images), # [URL_IMAGE_1, URL_IMAGE_2, ...]
|
||||
'url_canonical': url_canonical, # Canonical URL (redirection)
|
||||
# 'html': article.html, # HTML article
|
||||
}
|
||||
logger.debug("Processing OK: {}".format(url_canonical))
|
||||
return url_canonical, article_elements, article_status
|
||||
except Exception as e:
|
||||
logger.warning("Exception processing url: {}\n{}".format(article_url, traceback.format_exc()))
|
||||
return None, {}, "error"
|
||||
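For context, a hedged usage sketch of process_article(); the URL and the empty pattern list are placeholders:

# Hypothetical usage sketch
patterns = []  # normally URL_DB_Writer._get_pattern_status_list(), sorted by priority
url_canonical, elements, status = process_article("https://www.example.com/some-article", patterns)
if status == "valid":
    print(elements.get("title"), elements.get("published_date"))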
64
app_fetcher/src/user_agents.py
Normal file
@@ -0,0 +1,64 @@
|
||||
# https://techblog.willshouse.com/2012/01/03/most-common-user-agents/
|
||||
|
||||
user_agents_list = [
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36",
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36",
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0",
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36",
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36",
|
||||
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36",
|
||||
"Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/111.0",
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.3 Safari/605.1.15",
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/111.0",
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0",
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36",
|
||||
"Mozilla/5.0 (Windows NT 10.0; rv:111.0) Gecko/20100101 Firefox/111.0",
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36",
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
|
||||
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/111.0",
|
||||
"Mozilla/5.0 (X11; Linux x86_64; rv:102.0) Gecko/20100101 Firefox/102.0",
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15",
|
||||
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36",
|
||||
"Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/112.0",
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.44",
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.54",
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62",
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 OPR/96.0.0.0",
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 OPR/97.0.0.0",
|
||||
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36",
|
||||
"Mozilla/5.0 (Windows NT 10.0; rv:102.0) Gecko/20100101 Firefox/102.0",
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0",
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36",
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.48",
|
||||
"Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/110.0",
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.34",
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15",
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/110.0",
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.39",
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.2 Safari/605.1.15",
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
|
||||
"Mozilla/5.0 (Windows NT 10.0; rv:112.0) Gecko/20100101 Firefox/112.0",
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.51",
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/109.0",
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/112.0",
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
|
||||
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/112.0",
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/110.0",
|
||||
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/110.0",
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
|
||||
"Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0",
|
||||
"Mozilla/5.0 (X11; CrOS x86_64 14541.0.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36",
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 YaBrowser/23.3.0.2246 Yowser/2.5 Safari/537.36",
|
||||
"Mozilla/5.0 (X11; Linux x86_64; rv:108.0) Gecko/20100101 Firefox/108.0",
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36",
|
||||
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36",
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.6.1 Safari/605.1.15",
|
||||
"Mozilla/5.0 (Windows NT 6.1; rv:102.0) Gecko/20100101 Goanna/6.0 Firefox/102.0 PaleMoon/32.0.0",
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.41",
|
||||
"Mozilla/5.0 (Windows NT 10.0; rv:110.0) Gecko/20100101 Firefox/110.0",
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36",
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 YaBrowser/23.1.5.708 Yowser/2.5 Safari/537.36",
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/113.0",
|
||||
"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36"
|
||||
]
|
||||
33
app_fetcher/src/utils.py
Normal file
@@ -0,0 +1,33 @@
|
||||
|
||||
def remove_http_s(url):
|
||||
url = url.replace("https://", "") if url.startswith("https://") else url
|
||||
url = url.replace("http://", "") if url.startswith("http://") else url
|
||||
return url
|
||||
|
||||
def is_valid_url(url):
|
||||
if (url.startswith("https://")):
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
|
||||
def get_searxng_instances():
|
||||
# SearxNG instances: https://searx.space/
|
||||
searx_instances = set()
|
||||
searx_instances.add("https://searx.work/")
|
||||
searx_instances.add("https://search.ononoki.org/")
|
||||
searx_instances.add("https://searxng.nicfab.eu/")
|
||||
searx_instances.add("https://searx.be/")
|
||||
|
||||
# searx_instances.add("https://searx.fmac.xyz/")
|
||||
# searx_instances.add("https://northboot.xyz/") # FIX
|
||||
|
||||
# searx_instances.add("https://serx.ml/") # Offline
|
||||
# searx_instances.add("https://searx.ru/")
|
||||
# searx_instances.add("https://searx.sp-codes.de/")
|
||||
# searx_instances.add("https://searxng.nicfab.eu/")
|
||||
# searx_instances.add("https://s.frlt.one/")
|
||||
# searx_instances.add("https://search.sapti.me/")
|
||||
|
||||
# To list
|
||||
list_searx_instances = list(searx_instances)
|
||||
return list_searx_instances
|
||||
@@ -1,4 +1,5 @@
|
||||
from django.db import models
|
||||
from django.contrib.postgres.fields import ArrayField
|
||||
|
||||
# Create your models here.
|
||||
class Urls(models.Model):
|
||||
@@ -44,3 +45,17 @@ class UrlsSource(models.Model):
|
||||
managed = False
|
||||
db_table = 'urls_source'
|
||||
unique_together = (('id_url', 'id_source'),)
|
||||
|
||||
class UrlContent(models.Model):
|
||||
id_url = models.OneToOneField(Urls, models.DO_NOTHING, db_column='id_url', primary_key=True)
|
||||
date_published = models.DateTimeField(blank=True, null=True)
|
||||
title = models.TextField(blank=True, null=True)
|
||||
description = models.TextField(blank=True, null=True)
|
||||
content = models.TextField(blank=True, null=True)
|
||||
tags = ArrayField(models.TextField(blank=True, null=True))
|
||||
authors = ArrayField(models.TextField(blank=True, null=True))
|
||||
image_urls = ArrayField(models.TextField(blank=True, null=True))
|
||||
|
||||
class Meta:
|
||||
managed = False
|
||||
db_table = 'url_content'
|
||||
@@ -14,7 +14,7 @@
|
||||
<tbody>
|
||||
{% for item in page_obj %}
|
||||
<tr>
|
||||
<td><a href="https://{{ item.url }}/">{{ item.url }}</a></td>
|
||||
<td><a href="{{ item.url }}/" target="_blank">{{ item.url }}</a></td>
|
||||
<td>{{ item.ts_fetch }}</td>
|
||||
<td>
|
||||
{% with sources_map|dict_get:item.id as sources %}
|
||||
@@ -131,7 +131,7 @@
|
||||
<table class="table table-bordered">
|
||||
<tr>
|
||||
<th>URL</th>
|
||||
<td>{{ url_item.url }}</td>
|
||||
<td><a href="{{ url_item.url }}" target="_blank">{{ url_item.url }}</a></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>Fetch Date</th>
|
||||
@@ -145,9 +145,32 @@
|
||||
<th>Status</th>
|
||||
<td>{{ url_item.status }}</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>Title</th>
|
||||
<td>{{ url_content.title }}</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>Description</th>
|
||||
<td>{{ url_content.description }}</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>Content</th>
|
||||
<td>{{ url_content.content }}</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>Tags</th>
|
||||
<td>{{ url_content.tags }}</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>Authors</th>
|
||||
<td>{{ url_content.authors }}</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>Image URLs</th>
|
||||
<td>{{ url_content.image_urls }}</td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
|
||||
<!-- Independent form for optional values -->
|
||||
<form onsubmit="fetchDetailsWithSelection(event, {{ url_item.id }}, '{{ url_item.url }}')">
|
||||
<label for="options-{{ url_item.id }}">Model:</label>
|
||||
@@ -7,7 +7,7 @@ import json
|
||||
import time
|
||||
import ollama
|
||||
|
||||
from .models import Urls, Source, UrlsSource
|
||||
from .models import Urls, Source, UrlsSource, UrlContent
|
||||
|
||||
# Create your views here.
|
||||
def index(request):
|
||||
@@ -60,19 +60,27 @@ def news(request):
|
||||
def url_detail_view(request, id):
|
||||
url_item = get_object_or_404(Urls, id=id)
|
||||
url_sources = list(Source.objects.filter(urlssource__id_url=url_item).values_list('source', flat=True))
|
||||
try:
|
||||
url_content = UrlContent.objects.get(pk=id)
|
||||
except UrlContent.DoesNotExist:
|
||||
url_content = {}
|
||||
|
||||
#print(url_content.__dict__)
|
||||
|
||||
# TODO: https://github.com/ollama/ollama-python?tab=readme-ov-file#async-client
|
||||
# LLM models available
|
||||
client = ollama.Client(host = 'https://ollamamodel.matitos.org')
|
||||
models = [m.model for m in client.list().models]
|
||||
models = sorted([m.model for m in client.list().models])
|
||||
print(models)
|
||||
|
||||
context = {
|
||||
'url_item': url_item,
|
||||
'sources': url_sources,
|
||||
'models': models,
|
||||
"prompt": "Provide in one paragraph the what, why, when, where, who, and how of the content below. Also provide a one paragraph summary of the content:",
|
||||
'prompt': "Provide in one paragraph the what, why, when, where, who, and how of the content below. Also provide a one paragraph summary of the content:",
|
||||
#"prompt": "Image you are a journalist, TLDR in a paragraph:",
|
||||
#"prompt": "Below you will find the whole content of a news article:\n{}\nProvide a concise summary of one paragraph maximum of the content.".format(content)
|
||||
'url_content': url_content,
|
||||
}
|
||||
return render(request, 'url_detail.html', context)
|
||||
|
||||