General search fix, status pattern match regex, find feeds on startup
README.md
@@ -4,7 +4,7 @@
- Fetch parsing URL host
- Fetch from RSS feed
- Fetch keyword search (Google search & news, DuckDuckGo, ...)
++ Sources -> Robustness to TooManyRequests block
- TODO: More sources -> Robustness to TooManyRequests block
- Selenium based
- Sites change their logic, request captcha, ...
- Brave Search API
@@ -12,22 +12,32 @@
- Bing API
- Subscription required
- Yandex. No API?
++ Proxy / VPN?
TooManyRequests, ...
++ Search per locale (nl-NL, fr-FR, en-GB)
- TODO: Proxy / VPN?
- TooManyRequests, ...
- TODO: Search per locale (nl-NL, fr-FR, en-GB)

- URLs Processing -> Updates raw URLs
- Extracts title, description, content, image and video URLs, main image URL, language, keywords, authors, tags, published date
- Determines if it is a valid article content
++ Proxy / VPN?
Bypass geoblock
- TODO: Proxy / VPN?
- Bypass geoblock and TooManyRequests

- Visualization of URLs
- Filter URLs
- By status, search, source, language, ...
- By fetch date, status, search, source, language, has valid content, minimum amount of sources, ...
- Charts

- Valid URLs
- URLs selection
- Published (or fetch) date during last_week / last 24 hrs
- Language of interest
- Valid content
- Fetched by at least N sources
- Use classifications and summaries
- TODO: Manual inspection -> Improve automation
- Rules or pattern for invalid articles, e.g. "youtube.com/*"
- URL host with "priority" or "weight"

- Content generation
- Generate summary
- One paragraph
- At most three paragraphs
@@ -35,12 +45,4 @@
- 5W: Who, What, When, Where, Why of a Story
- Related to child abuse?
- ...

- Content generation
- URLs Selection
- Valid content
- Language of interest
- Published (or fetch) date during last_week
- Fetched by at least N sources
- Use classifications and summaries
- Merge summaries, ...
- Merge similar articles?

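The "Valid URLs -> URLs selection" criteria above map naturally onto a Django queryset; a minimal sketch, assuming the Urls / UrlContent / UrlsSourceSearch models shown later in this diff (the reverse accessor names "urlcontent" and "urlssourcesearch" are assumptions based on Django defaults):

    from django.db.models import Count
    from django.utils.timezone import now, timedelta
    from .models import Urls

    recent_valid = (
        Urls.objects
        .filter(status=Urls.STATUS_ENUM.VALID,
                urlcontent__valid_content=True,                          # valid content
                urlcontent__language="en",                               # language of interest
                urlcontent__date_published__gte=now() - timedelta(days=7))  # published during last week
        .annotate(n_sources=Count("urlssourcesearch__id_source", distinct=True))
        .filter(n_sources__gte=2)                                        # fetched by at least N sources
    )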
@@ -7,12 +7,8 @@ logs_directory = os.getenv("PATH_LOGS_DIRECTORY", "logs")
# Directory of logs
os.makedirs(logs_directory, exist_ok=True)

# Too many logging entries otherwise
logging.getLogger("requests").setLevel(logging.WARNING)
logging.getLogger("urllib3").setLevel(logging.WARNING)

logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
logger = logging.getLogger("selenium")
logger = logging.getLogger("app_selenium")
logger.setLevel(logging.DEBUG)

# To file log: INFO / WARNING / ERROR / CRITICAL

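With the configuration above, other modules in the Selenium app pick up the same handlers simply by requesting the renamed logger (a minimal usage sketch):

    import logging

    logger = logging.getLogger("app_selenium")   # name set in this hunk (was "selenium")
    logger.debug("Processing page: 1...")        # emitted with the basicConfig format configured above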
@@ -39,7 +39,7 @@ class MissingKidsFetcher():
logger.debug("Processing page: {}...".format(i))

try:
time.sleep(int(os.getenv("SELENIUM_SLEEP_PER_PAGE", 4))) #driver.implicitly_wait(3)
time.sleep(float(os.getenv("SELENIUM_SLEEP_PER_PAGE", 4))) #driver.implicitly_wait(3)
# Fetch poster URLs
for element_type in ["a"]: # ["a", "p", "div"]:
for elem in driver.find_elements(By.TAG_NAME, element_type):
@@ -75,7 +75,7 @@ class MissingKidsFetcher():
logger.info(e.text)

# driver.refresh()
time.sleep(int(os.getenv("SELENIUM_SLEEP_PER_PAGE", 4)))
time.sleep(float(os.getenv("SELENIUM_SLEEP_PER_PAGE", 4)))

if (i == first_n_pages):
continue_iterating = False
@@ -86,6 +86,7 @@ class MissingKidsFetcher():
logger.warning("Exception while fetching MissingKids {}".format(str(e)), exc_info=True)
set_urls = set()

# Release memory
try:
driver.close()
except Exception as e:

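The int() -> float() changes in this hunk (and the similar ones below) allow fractional sleep values to be supplied via the environment; a small illustration:

    import os

    os.environ["SELENIUM_SLEEP_PER_PAGE"] = "1.5"
    # int("1.5") raises ValueError: invalid literal for int() with base 10: '1.5'
    float(os.getenv("SELENIUM_SLEEP_PER_PAGE", 4))   # -> 1.5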
@@ -25,4 +25,4 @@ RUN chown -R appuser:appuser /opt
USER appuser

# Run Django’s server & workers
CMD ["sh", "-c", "/opt/app/script_initialize.sh && /opt/app/script_run.sh"]
CMD ["sh", "-c", "/opt/app/initialize.sh && /opt/app/run.sh"]

@@ -67,6 +67,8 @@ class Meta:

* Database & initialization
* Check initialize.sh on Dockerfile
* init_data.json
Insert URL hosts of interest, RSS feeds, keyword searches, and regex (escaped) status patterns that set URLs to "invalid" or "valid"

* Environment variables
* In docker-compose.yml

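A short sketch of how an escaped status pattern for init_data.json can be produced (mirrors the re.escape usage in the old db.py below; the entry layout is [pattern, status, priority]):

    import re

    host = "youtube.com/"
    pattern = ".*{}.*".format(re.escape(host))   # -> '.*youtube\\.com/.*'
    entry = [pattern, "invalid", 50]             # one REGEX_PATTERN_STATUS_PRIORITY entry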
app_urls/db.py
@@ -1,202 +0,0 @@
|
||||
import argparse
|
||||
import os
|
||||
import psycopg
|
||||
import re
|
||||
import time
|
||||
|
||||
connection_info = "host={} port={} dbname={} user={} password={} connect_timeout=60".format(
|
||||
os.environ.get("DB_HOST", "localhost"),
|
||||
os.environ.get("DB_PORT", "5432"),
|
||||
os.environ.get("DB_NAME", "matitos"),
|
||||
os.environ.get("DB_USER", "supermatitos"),
|
||||
os.environ.get("DB_PASSWORD", "supermatitos")
|
||||
)
|
||||
|
||||
def wait_connection():
|
||||
connected = False
|
||||
while (not connected):
|
||||
try:
|
||||
# Connect to an existing database
|
||||
with psycopg.connect(connection_info) as conn:
|
||||
# Open a cursor to perform database operations
|
||||
with conn.cursor() as cur:
|
||||
# Create URLs table
|
||||
c = cur.execute("SELECT 1;").fetchall()
|
||||
connected = True
|
||||
|
||||
except psycopg.OperationalError as e:
|
||||
# Connection not ready...
|
||||
# print(".", end="")
|
||||
time.sleep(2)
|
||||
except Exception as e:
|
||||
# Connection not ready...
|
||||
# print("e", end="")
|
||||
time.sleep(2)
|
||||
|
||||
print("DB connection ready")
|
||||
|
||||
def initialize_tables():
|
||||
# Connect to an existing database
|
||||
with psycopg.connect(connection_info) as conn:
|
||||
# Open a cursor to perform database operations
|
||||
with conn.cursor() as cur:
|
||||
# Autocommit at end of transaction (Atomic creation of tables)
|
||||
with conn.transaction() as tx:
|
||||
# Create URLs table
|
||||
c = cur.execute("""
|
||||
CREATE TYPE URL_STATUS AS ENUM ('raw', 'error', 'valid', 'unknown', 'invalid', 'duplicate');
|
||||
|
||||
CREATE TABLE URLS (
|
||||
id SERIAL PRIMARY KEY,
|
||||
url TEXT NOT NULL UNIQUE,
|
||||
ts_fetch TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||
status URL_STATUS NOT NULL DEFAULT 'raw' -- ,
|
||||
-- status_wendy WENDY_STATUS DEFAULT NULL,
|
||||
-- ts_wendy TIMESTAMPTZ DEFAULT NULL
|
||||
);
|
||||
CREATE INDEX idx_urls_status ON urls(status);
|
||||
CREATE INDEX idx_urls_ts_fetch ON urls(ts_fetch);
|
||||
|
||||
CREATE TABLE URLS_DUPLICATE (
|
||||
id_url_canonical INTEGER REFERENCES URLS(id),
|
||||
id_url_duplicated INTEGER REFERENCES URLS(id),
|
||||
PRIMARY KEY (id_url_canonical, id_url_duplicated)
|
||||
);
|
||||
|
||||
CREATE TYPE SEARCH_TYPE AS ENUM ('rss_feed', 'keyword_search', 'url_host');
|
||||
CREATE TABLE SEARCH (
|
||||
id SMALLSERIAL PRIMARY KEY,
|
||||
search TEXT NOT NULL UNIQUE,
|
||||
type SEARCH_TYPE NOT NULL
|
||||
-- language_country CHAR(5), -- Language: ISO 639-1 Code. Country: ISO 3166 ALPHA-2. e.g.: en-us. Required for search
|
||||
-- UNIQUE(search, language_country)
|
||||
);
|
||||
CREATE INDEX idx_search_type ON SEARCH(type);
|
||||
|
||||
CREATE TABLE SOURCE (
|
||||
id SMALLSERIAL PRIMARY KEY,
|
||||
source TEXT NOT NULL UNIQUE
|
||||
);
|
||||
|
||||
-- CREATE TABLE SEARCH_LANGUAGE (
|
||||
-- language CHAR(2) NOT NULL, -- ISO 639-1 Code, e.g. "en"
|
||||
-- country CHAR(2) NOT NULL, -- ISO 3166 ALPHA-2, e.g. "us"
|
||||
-- PRIMARY KEY (language, country)
|
||||
-- );
|
||||
|
||||
CREATE TABLE URLS_SOURCE_SEARCH (
|
||||
id_url INTEGER REFERENCES URLS(id),
|
||||
id_source SMALLINT REFERENCES SOURCE(id) ON UPDATE CASCADE ON DELETE RESTRICT,
|
||||
id_search SMALLINT REFERENCES SEARCH(id) ON UPDATE CASCADE ON DELETE RESTRICT,
|
||||
PRIMARY KEY(id_url, id_source, id_search)
|
||||
);
|
||||
CREATE INDEX idx_source ON URLS_SOURCE_SEARCH(id_source);
|
||||
CREATE INDEX idx_search ON URLS_SOURCE_SEARCH(id_search);
|
||||
|
||||
CREATE TABLE STATUS_PATTERN_MATCHING (
|
||||
pattern TEXT PRIMARY KEY,
|
||||
priority SMALLINT NOT NULL,
|
||||
status URL_STATUS NOT NULL
|
||||
);
|
||||
|
||||
|
||||
CREATE TABLE URL_CONTENT (
|
||||
id_url INTEGER PRIMARY KEY REFERENCES URLS(id),
|
||||
date_published TIMESTAMPTZ DEFAULT NOW(),
|
||||
title TEXT,
|
||||
description TEXT,
|
||||
content TEXT,
|
||||
valid_content BOOLEAN,
|
||||
language CHAR(2), -- ISO 639-1 Code
|
||||
keywords TEXT[],
|
||||
tags TEXT[],
|
||||
authors TEXT[],
|
||||
image_main_url TEXT,
|
||||
images_url TEXT[],
|
||||
videos_url TEXT[],
|
||||
url_host TEXT, -- www.breitbart.com
|
||||
site_name TEXT -- Breitbart News
|
||||
);
|
||||
CREATE INDEX idx_tags ON URL_CONTENT USING GIN(tags);
|
||||
CREATE INDEX idx_authors ON URL_CONTENT USING GIN(authors);
|
||||
CREATE INDEX idx_date_published ON URL_CONTENT (date_published);
|
||||
CREATE INDEX idx_valid_content ON URL_CONTENT (valid_content);
|
||||
CREATE INDEX idx_language ON URL_CONTENT (language);
|
||||
CREATE INDEX idx_url_host ON URL_CONTENT (url_host);
|
||||
""")
|
||||
|
||||
def initialize_data():
|
||||
# Connect to an existing database
|
||||
with psycopg.connect(connection_info) as conn:
|
||||
# Open a cursor to perform database operations
|
||||
with conn.cursor() as cur:
|
||||
# Autocommit at end of transaction (Atomic creation of data)
|
||||
with conn.transaction() as tx:
|
||||
# Feeds
|
||||
cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC', 'rss_feed');" )
|
||||
cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('https://feeds.feedburner.com/breitbart', 'rss_feed');" )
|
||||
cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('http://feeds.feedburner.com/zerohedge/feed', 'rss_feed');" )
|
||||
cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('https://moxie.foxnews.com/google-publisher/latest.xml', 'rss_feed');" )
|
||||
cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=15837362', 'rss_feed');" )
|
||||
cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=100727362', 'rss_feed');" )
|
||||
# Websites of interest
|
||||
cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('missingkids.org/poster', 'url_host');" )
|
||||
cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('missingkids.org/new-poster', 'url_host');" )
|
||||
cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('breitbart.com', 'url_host');" )
|
||||
cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('zerohedge.com', 'url_host');" )
|
||||
cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('foxnews.com', 'url_host');" )
|
||||
cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('cnbc.com', 'url_host');" )
|
||||
# Search keywords
|
||||
cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('child abuse', 'keyword_search');" )
|
||||
# TODO: Language per search
|
||||
# cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('child abuse', 'keyword_search', 'en-us');" )
|
||||
# cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('child abuse', 'keyword_search', 'en-gb');" )
|
||||
|
||||
# Status update based on pattern matching (with priority to apply in order). Regex test https://regex101.com/
|
||||
cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("youtube.com/"))) )
|
||||
cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("yewtu.be/"))) )
|
||||
cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("tiktok.com/"))) )
|
||||
cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("twitter.com/"))) )
|
||||
cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("reddit.com/"))) )
|
||||
cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("libreddit.de/"))) )
|
||||
cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("radio.foxnews.com/"))) )
|
||||
|
||||
""" # TODO: To review with new scheme
|
||||
# Status update based on pattern matching (with priority to apply in order)
|
||||
cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*cnbc.com/(video|quotes)/.*', 100, 'invalid');" )
|
||||
cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*foxnews.com/(video|category)/.*', 100, 'invalid');" )
|
||||
cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*breitbart.com/(tag|author)/.*', 100, 'invalid');" )
|
||||
|
||||
cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*zerohedge.com/(economics|political|markets)/.*', 50, 'valid');" )
|
||||
cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*breitbart.com/(economy|entertainment|border|crime|clips)/.*', 50, 'valid');" )
|
||||
cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*foxnews.com/(lifestyle|opinion|sports|world)/.*', 50, 'valid');" )
|
||||
cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*cnbc.com/[0-9]{4}/[0-9]{2}/[0-9]{2}/.*', 50, 'valid');" )
|
||||
|
||||
cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*bbc.com/news/.*', 50, 'valid');" )
|
||||
cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*msn.com/[A-z]{2}-[A-z]{2}/news/.*', 50, 'valid');" )
|
||||
cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*newschannel9.com/news/.*', 50, 'valid');" )
|
||||
|
||||
cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*radaronline.com/p.*', 25, 'valid');" )
|
||||
cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*okmagazine.com/p.*', 25, 'valid');" )
|
||||
cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*9news.com.au/national.*', 25, 'valid');" )
|
||||
"""
|
||||
|
||||
|
||||
def main(name):
|
||||
print('Hello, %s!' % name)
|
||||
|
||||
if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser(description='Database initialization')
|
||||
parser.add_argument('--initialize_tables', help='Create DB tables', action='store_true', default=False)
|
||||
parser.add_argument('--initialize_data', help='Insert data', action='store_true', default=False)
|
||||
args = parser.parse_args()
|
||||
|
||||
# Wait for DB connection
|
||||
wait_connection()
|
||||
|
||||
if (args.initialize_tables):
|
||||
print("Initializing tables")
|
||||
initialize_tables()
|
||||
if (args.initialize_data):
|
||||
print("Initializing data")
|
||||
initialize_data()
|
||||
@@ -81,6 +81,7 @@ class DB_Handler():
|
||||
except Exception as e:
|
||||
logger.warning("Exception inserting raw URLs: {}\n{}".format(e, traceback.format_exc()))
|
||||
|
||||
|
||||
def _process_single_url(self, obj_url, status_pattern_match, raise_exception_on_error):
|
||||
|
||||
def set_status(obj_url, status):
|
||||
@@ -89,17 +90,17 @@ class DB_Handler():
|
||||
obj_url.status = status
|
||||
obj_url.save()
|
||||
|
||||
##### Filter URL? -> Invalid
|
||||
if (status_pattern_match == "invalid"):
|
||||
logger.debug("Domain filter applied to input URL: {}".format(obj_url.url))
|
||||
# Found a pattern match -> Override status
|
||||
if (status_pattern_match is not None):
|
||||
logger.debug("Pattern match, status '{}' for input URL: {}".format(status_pattern_match, obj_url.url))
|
||||
# Update status
|
||||
set_status(obj_url, Urls.STATUS_ENUM.INVALID)
|
||||
# Next URL
|
||||
return
|
||||
set_status(obj_url, status_pattern_match)
|
||||
##### Filter URL? -> Invalid (don't extract content)
|
||||
if (status_pattern_match == "invalid"):
|
||||
return
|
||||
|
||||
##### Process URL
|
||||
try:
|
||||
# Get data
|
||||
# Extract URL content
|
||||
dict_url_data = process_url(obj_url.url)
|
||||
except Exception as e:
|
||||
if (raise_exception_on_error):
|
||||
@@ -110,25 +111,10 @@ class DB_Handler():
|
||||
# Set status to error
|
||||
dict_url_data = None
|
||||
|
||||
# (dict_url_data is None) or (Exception while processing URL) ? -> Error status
|
||||
if (dict_url_data is None):
|
||||
# Update status
|
||||
set_status(obj_url, Urls.STATUS_ENUM.ERROR)
|
||||
# Next URL
|
||||
return
|
||||
|
||||
# Invalid? e.g. binary data
|
||||
if (dict_url_data.get("override_status") == "invalid"):
|
||||
# Update status
|
||||
set_status(obj_url, Urls.STATUS_ENUM.INVALID)
|
||||
# Next URL
|
||||
return
|
||||
|
||||
##### Canonical URL different? -> Duplicate
|
||||
if (dict_url_data.get("url_canonical") is not None) and(dict_url_data.get("url") != dict_url_data.get("url_canonical")):
|
||||
if (dict_url_data is not None) and (dict_url_data.get("url_canonical") is not None) and (dict_url_data.get("url") != dict_url_data.get("url_canonical")):
|
||||
# Update status
|
||||
set_status(obj_url, Urls.STATUS_ENUM.DUPLICATE)
|
||||
|
||||
# Get or create URL with canonical form
|
||||
obj_url_canonical, created = Urls.objects.get_or_create(url=dict_url_data.get("url_canonical"))
|
||||
# Get the source-search IDs associated to obj_url.id
|
||||
@@ -136,42 +122,54 @@ class DB_Handler():
|
||||
for obj_url_source_search in list_url_source_search:
|
||||
# Associate same sources to url_canonical (it might already exist)
|
||||
UrlsSourceSearch.objects.get_or_create(id_url=obj_url_canonical, id_source=obj_url_source_search.id_source, id_search=obj_url_source_search.id_search)
|
||||
|
||||
# URLs duplicate association
|
||||
UrlsDuplicate.objects.get_or_create(id_url_canonical=obj_url_canonical, id_url_duplicated=obj_url)
|
||||
|
||||
# TODO: return obj_url_canonical so as to directly process the recently inserted URL
|
||||
# Wherever this function is called, add:
|
||||
# self._process_single_url(obj_url_canonical, status_pattern_match, raise_exception_on_error)
|
||||
|
||||
# Next URL
|
||||
return
|
||||
|
||||
##### Valid URL
|
||||
# Update status
|
||||
set_status(obj_url, Urls.STATUS_ENUM.VALID)
|
||||
# Not overriding status given pattern matching?
|
||||
if (status_pattern_match is None):
|
||||
# (dict_url_data is None) or (Exception while processing URL) ? -> Error status
|
||||
if (dict_url_data is None):
|
||||
# Update status
|
||||
set_status(obj_url, Urls.STATUS_ENUM.ERROR)
|
||||
# Next URL
|
||||
return
|
||||
|
||||
# Invalid? e.g. binary data
|
||||
if (dict_url_data.get("override_status") == "invalid"):
|
||||
# Update status
|
||||
set_status(obj_url, Urls.STATUS_ENUM.INVALID)
|
||||
# Next URL
|
||||
return
|
||||
|
||||
##### Valid URL
|
||||
# Update status
|
||||
set_status(obj_url, Urls.STATUS_ENUM.VALID)
|
||||
|
||||
try:
|
||||
# Create or update extracted URL data
|
||||
UrlContent.objects.update_or_create(
|
||||
id_url=obj_url,
|
||||
defaults = {
|
||||
"date_published" : dict_url_data.get("publish_date"),
|
||||
"title" : dict_url_data.get("title"),
|
||||
"description" : dict_url_data.get("description"),
|
||||
"content" : dict_url_data.get("content"),
|
||||
"valid_content" : dict_url_data.get("valid_content"),
|
||||
"language" : dict_url_data.get("language"),
|
||||
"keywords" : dict_url_data.get("keywords"),
|
||||
"tags" : dict_url_data.get("tags"),
|
||||
"authors" : dict_url_data.get("authors"),
|
||||
"image_main_url" : dict_url_data.get("image_main_url"),
|
||||
"images_url" : dict_url_data.get("images_url"),
|
||||
"videos_url" : dict_url_data.get("videos_url"),
|
||||
"url_host" : dict_url_data.get("url_host"),
|
||||
"site_name" : dict_url_data.get("site_name"),
|
||||
}
|
||||
)
|
||||
if (dict_url_data is not None):
|
||||
# Create or update extracted URL data
|
||||
UrlContent.objects.update_or_create(
|
||||
id_url=obj_url,
|
||||
defaults = {
|
||||
"date_published" : dict_url_data.get("publish_date"),
|
||||
"title" : dict_url_data.get("title"),
|
||||
"description" : dict_url_data.get("description"),
|
||||
"content" : dict_url_data.get("content"),
|
||||
"valid_content" : dict_url_data.get("valid_content"),
|
||||
"language" : dict_url_data.get("language"),
|
||||
"keywords" : dict_url_data.get("keywords"),
|
||||
"tags" : dict_url_data.get("tags"),
|
||||
"authors" : dict_url_data.get("authors"),
|
||||
"image_main_url" : dict_url_data.get("image_main_url"),
|
||||
"images_url" : dict_url_data.get("images_url"),
|
||||
"videos_url" : dict_url_data.get("videos_url"),
|
||||
"url_host" : dict_url_data.get("url_host"),
|
||||
"site_name" : dict_url_data.get("site_name"),
|
||||
}
|
||||
)
|
||||
except Exception as e:
|
||||
logger.debug("Error in update_or_create UrlContent: {}\ndict_url_data: {}\n{}\n{}".format(obj_url.url, dict_url_data, str(e), traceback.format_exc()))
|
||||
|
||||
@@ -179,13 +177,12 @@ class DB_Handler():
|
||||
def process_raw_urls(self, batch_size):
|
||||
|
||||
def _get_status_pattern_matching(url, list_pattern_status_tuple):
|
||||
""" Be careful: Regex pattern should update status on "valid", "invalid", and "unknown" status only
|
||||
"""
|
||||
""" Be careful: Regex pattern should update status on "valid", "invalid", and "unknown" status only """
|
||||
# Sort pattern tuples by priority. (pattern, priority, status)
|
||||
for regex_pattern, regex_priority, status_if_match in sorted(list_pattern_status_tuple, key=lambda tup: tup[1], reverse=True):
|
||||
# Regular expression pattern matching: https://regexr.com/
|
||||
if bool(re.match(regex_pattern, obj_url.url)):
|
||||
logger.debug("Regex pattern found, status '{}' for URL: {}".format(status_if_match, url))
|
||||
# logger.debug("Regex pattern found, status '{}' for URL: {}".format(status_if_match, url))
|
||||
return status_if_match
|
||||
return None
|
||||
|
||||
|
||||
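A standalone sketch of the priority-ordered matching performed by _get_status_pattern_matching above (tuple layout as in STATUS_PATTERN_MATCHING: pattern, priority, status; sample patterns taken from init_data.json):

    import re

    patterns = [
        (r".*(youtube|tiktok|twitter|reddit)\.com\/.*", 50, "invalid"),
        (r".*cnbc\.com\/(video|quotes)\/.*", 75, "invalid"),
    ]

    def status_for(url, list_pattern_status_tuple):
        # Highest priority first; the first matching pattern decides the status
        for pattern, priority, status in sorted(list_pattern_status_tuple, key=lambda t: t[1], reverse=True):
            if re.match(pattern, url):
                return status
        return None

    status_for("https://www.cnbc.com/video/2025/clip.html", patterns)   # -> 'invalid'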
@@ -53,7 +53,7 @@ class FetchSearcher():

for SearchInstance in ListSearchInstances:
# Sleep between requests, avoid too many requests...
time.sleep(int(os.getenv("FETCHER_BETWEEN_SEARCHES_SLEEP", 5)))
time.sleep(float(os.getenv("FETCHER_BETWEEN_SEARCHES_SLEEP", 5)))
SearchInstance(args).fetch_articles(db_writer, obj_search)

# TODO: https://github.com/tasos-py/Search-Engines-Scraper/tree/master

@@ -55,8 +55,8 @@ class FetcherAbstract(ABC):
keyword_search = "{}{}".format("site:", keyword_search)
# Keyword search & using a General search? -> ${SEARCH} news after:${LAST_WEEK}
if ("general" in source_name) and (obj_search.type == Search.TYPE_ENUM.KEYWORD_SEARCH):
start_date = timezone.now() - timedelta(days=7)
keyword_search = "{}{}".format(keyword_search, "news after:{}-{}-{}".format(start_date.month, start_date.day, start_date.year))
logger.debug("Appending news to general search")
keyword_search = "{}{}".format(keyword_search, "news")

logger.debug("Starting search: {} - {}".format(keyword_search, source_name))
# Fetch
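Note that the new line above concatenates without a separator, so a query like 'child abuse' becomes 'child abusenews'; if a space is intended, a variant would be (an assumption, not part of this commit):

    keyword_search = "{} {}".format(keyword_search, "news")   # -> 'child abuse news'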
@@ -194,7 +194,7 @@ class SearchGoogleGeneral(FetcherAbstract):
# Iterate pages
for i in range(self.pages):
# Sleep between pages fetch
time.sleep(int(os.getenv("FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP", 4)))
time.sleep(float(os.getenv("FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP", 4)))
# Number of URLs fetched so far
num_before = len(set_links)
# Get page

@@ -6,7 +6,7 @@ logger = get_logger()
from googlenewsdecoder import gnewsdecoder

def decode_gnews_urls(encoded_urls, interval=int(os.getenv("FETCHER_GNEWS_DECODE_SLEEP", 2))):
def decode_gnews_urls(encoded_urls, interval=float(os.getenv("FETCHER_GNEWS_DECODE_SLEEP", 2))):
logger.debug("Decoding gnews URLs")
# DecodeURLs
list_decoded_urls = []

@@ -41,7 +41,7 @@ def url_host_slowdown(url, url_host_slowdown_seconds):
def process_url(url):
try:
# Slow down if required to avoid too many requests error
url_host_slowdown(url, url_host_slowdown_seconds=int(os.getenv("FETCHER_URL_HOST_SLEEP", 5)))
url_host_slowdown(url, url_host_slowdown_seconds=float(os.getenv("FETCHER_URL_HOST_SLEEP", 5)))
# Process
article = newspaper.article(url)
except newspaper.ArticleBinaryDataException:

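url_host_slowdown itself is not shown in this diff; a minimal sketch of what such a per-host throttle might look like (the module-level cache name and the logic are assumptions):

    import time
    import urllib.parse

    _last_hit = {}   # hypothetical module-level map: host -> timestamp of last request

    def url_host_slowdown(url, url_host_slowdown_seconds):
        # Sleep only if the same host was requested less than url_host_slowdown_seconds ago
        host = urllib.parse.urlparse(url).netloc
        elapsed = time.monotonic() - _last_hit.get(host, 0.0)
        if elapsed < url_host_slowdown_seconds:
            time.sleep(url_host_slowdown_seconds - elapsed)
        _last_hit[host] = time.monotonic()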
@@ -4,6 +4,7 @@ from . import views
urlpatterns = [
path('', views.link_list, name='link_list'),
#
path('logs/database', views.log_db, name='log_db'),
path('logs/<str:log_type>', views.logs, name='logs'),
#
path('task/<str:task>', views.trigger_task, name='trigger_task'),
@@ -17,4 +18,5 @@ urlpatterns = [
path('urls/', views.filtered_urls, name='filtered_urls'),
path('urls/<int:id>/', views.url_detail_view, name='url_detail'),
path('urls/<int:id>/fetch/', views.fetch_details, name='fetch_details'),
path('urls/content_generation', views.content_generation, name='content_generation'),
]

@@ -1,44 +1,16 @@
|
||||
from .tasks import background_task
|
||||
from .views_base import link_list, logs, log_db, trigger_task
|
||||
|
||||
from django.core.paginator import Paginator
|
||||
from django.shortcuts import render, get_object_or_404
|
||||
from django.http import StreamingHttpResponse, JsonResponse, HttpResponse
|
||||
from django.contrib.auth.decorators import login_required
|
||||
import ollama
|
||||
from django.http import StreamingHttpResponse, JsonResponse
|
||||
from django.db.models import Q, Count
|
||||
from django.utils import timezone
|
||||
from django.utils.timezone import now, timedelta
|
||||
from .models import Urls, Source, Search, UrlContent, UrlsSourceSearch, UrlsDuplicate
|
||||
import ollama
|
||||
import os
|
||||
from .src.logger import get_logger
|
||||
logger = get_logger()
|
||||
#from datetime import timedelta
|
||||
|
||||
####################################################################################################
|
||||
def trigger_task(request, task):
|
||||
# Enqueue function in "default" queue
|
||||
background_task.delay(task)
|
||||
return JsonResponse({"message": "Task has been enqueued!", "task": task})
|
||||
|
||||
####################################################################################################
|
||||
def link_list(request):
|
||||
# Base URL path
|
||||
app_url = request.build_absolute_uri()
|
||||
# Tasks
|
||||
links_fetch = ["fetch_feeds", "fetch_parser", "fetch_search", "fetch_missingkids_5", "fetch_missingkids_all"]
|
||||
links_process = ["process_raw_urls_50", "process_error_urls_50", "process_missing_kids_urls_50", "process_missing_kids_urls_all"]
|
||||
# List of links
|
||||
list_links = \
|
||||
[ os.path.join(app_url, "admin"), os.path.join(app_url, "urls") ] + \
|
||||
[ os.path.join(app_url, "logs", log_type) for log_type in ["debug", "info", "warning"] ] + \
|
||||
[ os.path.join(app_url, "task", l) for l in links_fetch + links_process ]
|
||||
# Json
|
||||
return JsonResponse({"links": list_links })
|
||||
|
||||
####################################################################################################
|
||||
def logs(request, log_type):
|
||||
# Capture output: python manage.py rqstats
|
||||
try:
|
||||
with open( os.path.join( os.getenv("PATH_LOGS_DIRECTORY", "logs"), "{}.log".format(log_type) ), "r") as f:
|
||||
file_content = f.read()
|
||||
except Exception as e:
|
||||
file_content = "Error reading logs for log type :{}".format(log_type)
|
||||
return HttpResponse(file_content, content_type="text/plain")
|
||||
|
||||
####################################################################################################
|
||||
class OllamaClient():
|
||||
@@ -57,13 +29,6 @@ class OllamaClient():
|
||||
|
||||
def get_prompt(self):
|
||||
return "Rewrite the text below into a clear and concise summary, presenting the key points as if they are newly written insights. Do not mention or reference the original text, its source, or any phrases like 'According to' or 'The text states'. Instead, write in a natural, standalone format that feels like an original explanation. Keep it brief, engaging, informative, in the style of a news article, and no longer than a paragraph:"
|
||||
#return "Provide a summary of the content below, avoid mentioning the source of information, and only answer with the summary. The summary needs to be brief and compact, consisting of one paragraph."
|
||||
#return "Explain in a single and compact paragraph the what, why, when, where, who, and how of the content below. Also provide a single paragraph summary of the content:"
|
||||
#return "Provide in one paragraph the what, why, when, where, who, and how of the content below. Also provide a one paragraph summary of the content:"
|
||||
#return "Provide two summaries of the content below, and avoid mentioning the source of information. First, provide a very brief and compact paragraph summary. Second, provide a larger and more detailed summary, which describe the what, why, when, where, who, and how of the content:"
|
||||
# return "Imagine you are a journalist, TLDR in a paragraph. Only answer with the summary:"
|
||||
#return "Below you will find the whole content of a news article:\n{}\nProvide a concise summary of one paragraph maximum of the content.".format(content)
|
||||
|
||||
|
||||
def fetch_details(request, id):
|
||||
url_item = get_object_or_404(Urls, id=id)
|
||||
@@ -83,7 +48,6 @@ def fetch_details(request, id):
|
||||
|
||||
return StreamingHttpResponse(stream_response(), content_type="text/plain")
|
||||
|
||||
|
||||
def url_detail_view(request, id):
|
||||
url_item = get_object_or_404(Urls, id=id)
|
||||
url_sources = list(Source.objects.filter(urlssourcesearch__id_url=url_item).distinct())
|
||||
@@ -114,13 +78,6 @@ def url_detail_view(request, id):
|
||||
return render(request, 'url_detail.html', context)
|
||||
|
||||
####################################################################################################
|
||||
from django.shortcuts import render
|
||||
from django.http import JsonResponse
|
||||
from django.db.models import Count
|
||||
from datetime import timedelta
|
||||
from django.utils import timezone
|
||||
from .models import Urls, UrlsSourceSearch
|
||||
|
||||
def charts(request):
|
||||
return render(request, 'charts.html')
|
||||
|
||||
@@ -202,14 +159,7 @@ def urls_per_search(request):
|
||||
|
||||
return JsonResponse(data)
|
||||
|
||||
|
||||
|
||||
####################################################################################################
|
||||
from django.shortcuts import render
|
||||
from .models import Urls, Search, Source
|
||||
from django.db.models import Q
|
||||
from django.utils.timezone import now, timedelta
|
||||
|
||||
|
||||
def filtered_urls(request):
|
||||
statuses = Urls.STATUS_ENUM.choices
|
||||
@@ -343,3 +293,15 @@ def filtered_urls(request):
|
||||
|
||||
return render(request, 'filtered_urls.html', context)
|
||||
####################################################################################################
|
||||
|
||||
def content_generation(request):
|
||||
# https://fetcher.matitos.org/urls/?per_page=100&days=1&valid_content=True&min_sources=1&search=13&status=all&language=all&source=all
|
||||
'''
|
||||
# Get list of URLs ID
|
||||
selected_urls = request.GET.getlist('urls', [])
|
||||
|
||||
# Sample URLs
|
||||
selected_urls = [13460, 13455, 13454, 13452, 13210]
|
||||
'''
|
||||
|
||||
####################################################################################################
|
||||
app_urls/fetcher/views_base.py (new file)
@@ -0,0 +1,74 @@
|
||||
import os
|
||||
import psycopg
|
||||
from .tasks import background_task
|
||||
from django.http import JsonResponse, HttpResponse
|
||||
|
||||
####################################################################################################
|
||||
def trigger_task(request, task):
|
||||
# Enqueue function in "default" queue
|
||||
background_task.delay(task)
|
||||
return JsonResponse({"message": "Task has been enqueued!", "task": task})
|
||||
|
||||
####################################################################################################
|
||||
def link_list(request):
|
||||
# Base URL path
|
||||
app_url = request.build_absolute_uri()
|
||||
# Tasks
|
||||
links_fetch = ["fetch_feeds", "fetch_parser", "fetch_search", "fetch_missingkids_5", "fetch_missingkids_all"]
|
||||
links_process = ["process_raw_urls_50", "process_error_urls_50", "process_missing_kids_urls_50", "process_missing_kids_urls_all"]
|
||||
# List of links
|
||||
list_links = \
|
||||
[ os.path.join(app_url, "admin"), os.path.join(app_url, "urls") ] + \
|
||||
[ os.path.join(app_url, "logs", log_type) for log_type in ["database", "debug", "info", "warning"] ] + \
|
||||
[ os.path.join(app_url, "task", l) for l in links_fetch + links_process ]
|
||||
# Json
|
||||
return JsonResponse({"links": list_links })
|
||||
|
||||
####################################################################################################
|
||||
def logs(request, log_type):
|
||||
# Capture output: python manage.py rqstats
|
||||
try:
|
||||
with open( os.path.join( os.getenv("PATH_LOGS_DIRECTORY", "logs"), "{}.log".format(log_type) ), "r") as f:
|
||||
file_content = f.read()
|
||||
except Exception as e:
|
||||
file_content = "Error reading logs for log type :{}".format(log_type)
|
||||
return HttpResponse(file_content, content_type="text/plain")
|
||||
|
||||
####################################################################################################
|
||||
def log_db(request):
|
||||
# TODO: Django connection
|
||||
connection_info = "host={} port={} dbname={} user={} password={} connect_timeout=60".format(
|
||||
os.environ.get("DB_HOST", "localhost"),
|
||||
os.environ.get("DB_PORT", "5432"),
|
||||
os.environ.get("DB_NAME", "matitos"),
|
||||
os.environ.get("DB_USER", "supermatitos"),
|
||||
os.environ.get("DB_PASSWORD", "supermatitos")
|
||||
)
|
||||
|
||||
# Connect to an existing database
|
||||
with psycopg.connect(connection_info) as conn:
|
||||
# Open a cursor to perform database operations
|
||||
with conn.cursor() as cur:
|
||||
# Create URLs table
|
||||
r = cur.execute("""
|
||||
SELECT
|
||||
relname AS "relation",
|
||||
pg_size_pretty (
|
||||
pg_total_relation_size (C .oid)
|
||||
) AS "total_size"
|
||||
FROM
|
||||
pg_class C
|
||||
LEFT JOIN pg_namespace N ON (N.oid = C .relnamespace)
|
||||
WHERE
|
||||
nspname NOT IN (
|
||||
'pg_catalog',
|
||||
'information_schema'
|
||||
)
|
||||
AND C .relkind <> 'i'
|
||||
AND nspname !~ '^pg_toast'
|
||||
ORDER BY
|
||||
pg_total_relation_size (C .oid) DESC
|
||||
LIMIT 100;
|
||||
""").fetchall()
|
||||
return HttpResponse( "\n".join([str(e) for e in r]) )
|
||||
####################################################################################################
|
||||
app_urls/init_data.json (new file)
@@ -0,0 +1,34 @@
{
  "SEARCH": {
    "rss_feed": [
      "https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC",
      "https://feeds.feedburner.com/breitbart",
      "https://feeds.feedburner.com/zerohedge/feed",
      "https://moxie.foxnews.com/google-publisher/latest.xml",
      "https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=15837362",
      "https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=100727362"
    ],
    "url_host": [
      "missingkids.org/poster",
      "missingkids.org/new-poster",
      "breitbart.com",
      "zerohedge.com",
      "foxnews.com",
      "cnbc.com"
    ],
    "keyword_search": [
      "child abuse"
    ]
  },
  "REGEX_PATTERN_STATUS_PRIORITY": [
    [".*(youtube|tiktok|twitter|reddit)\\.com\\/.*", "invalid", 50],
    [".*cnbc\\.com\\/(video|quotes)\\/.*", "invalid", 75],
    [".*foxnews\\.com\\/(video|category)\\/.*", "invalid", 75],
    [".*radio.foxnews\\.com\\/.*", "invalid", 75],
    [".*breitbart\\.com\\/(tag|author)\\/.*", "invalid", 75],
    [".*zerohedge\\.com\\/(user)\\/.*", "invalid", 75],
    [".*zerohedge\\.com\\/(economics|political|markets|)\\/.*", "valid", 50],
    [".*breitbart\\.com\\/(economy|entertainment|border|crime|clips)\\/.*", "valid", 50],
    [".*foxnews\\.com\\/(lifestyle|opinion|sports|world)\\/.*", "valid", 50]
  ]
}
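A quick way to sanity-check these entries before they are inserted (standalone sketch; the file name matches the one read by init_db.py below):

    import json, re

    with open("init_data.json") as f:
        cfg = json.load(f)

    pattern, status, priority = cfg["REGEX_PATTERN_STATUS_PRIORITY"][0]
    re.compile(pattern)                                              # raises re.error if the pattern is malformed
    bool(re.match(pattern, "https://www.youtube.com/watch?v=abc"))   # -> True, i.e. 'invalid'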
app_urls/init_data_fr.json (new file)
@@ -0,0 +1,66 @@
|
||||
{
|
||||
"SEARCH": {
|
||||
"rss_feed": [
|
||||
],
|
||||
"url_host": [
|
||||
"johnpilger.com",
|
||||
"lapenseeecologique.com",
|
||||
"partage-le.com",
|
||||
"reflets.info",
|
||||
"rezo.net",
|
||||
"consortiumnews.com",
|
||||
"disclose.ngo/fr",
|
||||
"energieetenvironnement.com",
|
||||
"global-climat.com",
|
||||
"lapenseeecologique.com",
|
||||
"slashdot.org",
|
||||
"lesamisdebartleby.wordpress.com",
|
||||
"lundi.am",
|
||||
"lvsl.fr",
|
||||
"moderndiplomacy.eu",
|
||||
"mrmondialisation.org",
|
||||
"ourfiniteworld.com",
|
||||
"southfront.org",
|
||||
"simplicius76.substack.com",
|
||||
"smoothiex12.blogspot.com",
|
||||
"theintercept.com",
|
||||
"wikileaks.org",
|
||||
"contretemps.eu",
|
||||
"indianpunchline.com",
|
||||
"investigaction.net/fr",
|
||||
"notechmagazine.com",
|
||||
"terrestres.org",
|
||||
"truthdig.com",
|
||||
"tass.com",
|
||||
"bastamag.net",
|
||||
"counterpunch.org",
|
||||
"energy-daily.com",
|
||||
"fakirpresse.info",
|
||||
"geopoliticalmonitor.com",
|
||||
"huffingtonpost.fr",
|
||||
"legrandsoir.info",
|
||||
"les-crises.fr",
|
||||
"liberation.fr",
|
||||
"maitre-eolas.fr",
|
||||
"marianne.net",
|
||||
"mediapart.fr",
|
||||
"metaefficient.com",
|
||||
"monde-diplomatique.fr",
|
||||
"paulcraigroberts.org",
|
||||
"politis.fr",
|
||||
"reporterre.net",
|
||||
"rue89.com",
|
||||
"theguardian.com/international",
|
||||
"treehugger.com",
|
||||
"unz.com",
|
||||
"voltairenet.org",
|
||||
"wsws.org"
|
||||
],
|
||||
"keyword_search": [
|
||||
"society collapse"
|
||||
]
|
||||
},
|
||||
"REGEX_PATTERN_STATUS_PRIORITY": [
|
||||
[".*(youtube|tiktok|twitter|reddit)\\.com\\/.*", "invalid", 50]
|
||||
]
|
||||
}
|
||||
app_urls/init_db.py (new file)
@@ -0,0 +1,244 @@
|
||||
import argparse
|
||||
import os
|
||||
import psycopg
|
||||
import json
|
||||
import time
|
||||
import urllib.parse
|
||||
import html5lib
|
||||
import feedparser
|
||||
import requests
|
||||
|
||||
connection_info = "host={} port={} dbname={} user={} password={} connect_timeout=60".format(
|
||||
os.environ.get("DB_HOST", "localhost"),
|
||||
os.environ.get("DB_PORT", "5432"),
|
||||
os.environ.get("DB_NAME", "matitos"),
|
||||
os.environ.get("DB_USER", "supermatitos"),
|
||||
os.environ.get("DB_PASSWORD", "supermatitos")
|
||||
)
|
||||
|
||||
def wait_connection():
|
||||
connected = False
|
||||
while (not connected):
|
||||
try:
|
||||
# Connect to an existing database
|
||||
with psycopg.connect(connection_info) as conn:
|
||||
# Open a cursor to perform database operations
|
||||
with conn.cursor() as cur:
|
||||
# Create URLs table
|
||||
c = cur.execute("SELECT 1;").fetchall()
|
||||
connected = True
|
||||
|
||||
except psycopg.OperationalError as e:
|
||||
# Connection not ready...
|
||||
# print(".", end="")
|
||||
time.sleep(2)
|
||||
except Exception as e:
|
||||
# Connection not ready...
|
||||
# print("e", end="")
|
||||
time.sleep(2)
|
||||
|
||||
print("DB connection ready")
|
||||
|
||||
def initialize_tables():
|
||||
# Connect to an existing database
|
||||
with psycopg.connect(connection_info) as conn:
|
||||
# Open a cursor to perform database operations
|
||||
with conn.cursor() as cur:
|
||||
# Autocommit at end of transaction (Atomic creation of tables)
|
||||
with conn.transaction() as tx:
|
||||
try:
|
||||
# Create URLs table
|
||||
c = cur.execute("""
|
||||
CREATE TYPE URL_STATUS AS ENUM ('raw', 'error', 'valid', 'unknown', 'invalid', 'duplicate');
|
||||
|
||||
CREATE TABLE URLS (
|
||||
id SERIAL PRIMARY KEY,
|
||||
url TEXT NOT NULL UNIQUE,
|
||||
ts_fetch TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||
status URL_STATUS NOT NULL DEFAULT 'raw' -- ,
|
||||
-- status_wendy WENDY_STATUS DEFAULT NULL,
|
||||
-- ts_wendy TIMESTAMPTZ DEFAULT NULL
|
||||
);
|
||||
CREATE INDEX idx_urls_status ON urls(status);
|
||||
CREATE INDEX idx_urls_ts_fetch ON urls(ts_fetch);
|
||||
|
||||
CREATE TABLE URLS_DUPLICATE (
|
||||
id_url_canonical INTEGER REFERENCES URLS(id),
|
||||
id_url_duplicated INTEGER REFERENCES URLS(id),
|
||||
PRIMARY KEY (id_url_canonical, id_url_duplicated)
|
||||
);
|
||||
|
||||
CREATE TYPE SEARCH_TYPE AS ENUM ('rss_feed', 'keyword_search', 'url_host');
|
||||
CREATE TABLE SEARCH (
|
||||
id SMALLSERIAL PRIMARY KEY,
|
||||
search TEXT NOT NULL UNIQUE,
|
||||
type SEARCH_TYPE NOT NULL
|
||||
-- language_country CHAR(5), -- Language: ISO 639-1 Code. Country: ISO 3166 ALPHA-2. e.g.: en-us. Required for search
|
||||
-- UNIQUE(search, language_country)
|
||||
);
|
||||
CREATE INDEX idx_search_type ON SEARCH(type);
|
||||
|
||||
CREATE TABLE SOURCE (
|
||||
id SMALLSERIAL PRIMARY KEY,
|
||||
source TEXT NOT NULL UNIQUE
|
||||
);
|
||||
|
||||
-- CREATE TABLE SEARCH_LANGUAGE (
|
||||
-- language CHAR(2) NOT NULL, -- ISO 639-1 Code, e.g. "en"
|
||||
-- country CHAR(2) NOT NULL, -- ISO 3166 ALPHA-2, e.g. "us"
|
||||
-- PRIMARY KEY (language, country)
|
||||
-- );
|
||||
|
||||
CREATE TABLE URLS_SOURCE_SEARCH (
|
||||
id_url INTEGER REFERENCES URLS(id),
|
||||
id_source SMALLINT REFERENCES SOURCE(id) ON UPDATE CASCADE ON DELETE RESTRICT,
|
||||
id_search SMALLINT REFERENCES SEARCH(id) ON UPDATE CASCADE ON DELETE RESTRICT,
|
||||
PRIMARY KEY(id_url, id_source, id_search)
|
||||
);
|
||||
CREATE INDEX idx_source ON URLS_SOURCE_SEARCH(id_source);
|
||||
CREATE INDEX idx_search ON URLS_SOURCE_SEARCH(id_search);
|
||||
|
||||
CREATE TABLE STATUS_PATTERN_MATCHING (
|
||||
pattern TEXT PRIMARY KEY,
|
||||
priority SMALLINT NOT NULL,
|
||||
status URL_STATUS NOT NULL
|
||||
);
|
||||
|
||||
|
||||
CREATE TABLE URL_CONTENT (
|
||||
id_url INTEGER PRIMARY KEY REFERENCES URLS(id),
|
||||
date_published TIMESTAMPTZ DEFAULT NOW(),
|
||||
title TEXT,
|
||||
description TEXT,
|
||||
content TEXT,
|
||||
valid_content BOOLEAN,
|
||||
language CHAR(2), -- ISO 639-1 Code
|
||||
keywords TEXT[],
|
||||
tags TEXT[],
|
||||
authors TEXT[],
|
||||
image_main_url TEXT,
|
||||
images_url TEXT[],
|
||||
videos_url TEXT[],
|
||||
url_host TEXT, -- www.breitbart.com
|
||||
site_name TEXT -- Breitbart News
|
||||
);
|
||||
CREATE INDEX idx_tags ON URL_CONTENT USING GIN(tags);
|
||||
CREATE INDEX idx_authors ON URL_CONTENT USING GIN(authors);
|
||||
CREATE INDEX idx_date_published ON URL_CONTENT (date_published);
|
||||
CREATE INDEX idx_valid_content ON URL_CONTENT (valid_content);
|
||||
CREATE INDEX idx_language ON URL_CONTENT (language);
|
||||
CREATE INDEX idx_url_host ON URL_CONTENT (url_host);
|
||||
""")
|
||||
except Exception as e:
|
||||
print(str(e))
|
||||
|
||||
|
||||
def find_feeds(url):
|
||||
list_feeds = []
|
||||
try:
|
||||
def get_with_protocol(url):
|
||||
# http:// -> https://
|
||||
url = url.replace("http://", "https://")
|
||||
# "" -> https://
|
||||
if not (url.startswith("https://")):
|
||||
url = "https://" + url
|
||||
return url
|
||||
url = get_with_protocol(url)
|
||||
|
||||
response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
|
||||
html = response.text
|
||||
tree = html5lib.parse(html, namespaceHTMLElements=False)
|
||||
|
||||
# base for relative URLs
|
||||
base = tree.findall('.//base')
|
||||
base_url = base[0].attrib['href'] if base and 'href' in base[0].attrib else url
|
||||
|
||||
# prioritize Atom over RSS
|
||||
links = tree.findall("""head/link[@rel='alternate'][@type='application/atom+xml']""") + tree.findall("""head/link[@rel='alternate'][@type='application/rss+xml']""")
|
||||
for link in links:
|
||||
href = link.attrib.get('href', '').strip()
|
||||
if href:
|
||||
r = requests.get(urllib.parse.urljoin(base_url, href), allow_redirects=True)
|
||||
list_feeds.append(r.url)
|
||||
|
||||
# heuristic search for common feed paths
|
||||
for suffix in [
|
||||
'feed', 'feed/', 'rss', 'atom', 'feed.xml',
|
||||
'/feed', '/feed/', '/rss', '/atom', '/feed.xml',
|
||||
'index.atom', 'index.rss', 'index.xml', 'atom.xml', 'rss.xml',
|
||||
'/index.atom', '/index.rss', '/index.xml', '/atom.xml', '/rss.xml',
|
||||
'.rss', '/.rss', '?rss=1', '?feed=rss2',
|
||||
]:
|
||||
try:
|
||||
potential_feed = urllib.parse.urljoin(base_url, suffix)
|
||||
response = requests.get(potential_feed, allow_redirects=True)
|
||||
if (response.status_code == 200) and (len(feedparser.parse(potential_feed).get("entries")) > 0):
|
||||
list_feeds.append(response.url)
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
except Exception as e:
|
||||
print(f"An error occurred: {e}")
|
||||
|
||||
# Remove duplicates
|
||||
return list(set(list_feeds))
|
||||
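A quick usage sketch of find_feeds above (output is illustrative; the result depends on the site's <link rel="alternate"> tags and the common feed paths probed):

    feeds = find_feeds("breitbart.com")
    print(feeds)   # e.g. ['https://feeds.feedburner.com/breitbart'] for a host advertising a FeedBurner feed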
|
||||
def initialize_data():
|
||||
# Read data
|
||||
with open("init_data.json", "r") as f:
|
||||
data_json = json.loads(f.read())
|
||||
|
||||
print("Initialization data:", data_json)
|
||||
|
||||
# Connect to an existing database
|
||||
with psycopg.connect(connection_info) as conn:
|
||||
# Open a cursor to perform database operations
|
||||
with conn.cursor() as cur:
|
||||
# Autocommit at end of transaction (Atomic creation of data)
|
||||
# with conn.transaction() as tx:
|
||||
|
||||
# TODO: Language per search
|
||||
# cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('child abuse', 'keyword_search', 'en-us');" )
|
||||
# cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('child abuse', 'keyword_search', 'en-gb');" )
|
||||
|
||||
for list_pattern_status_priority in data_json.get("REGEX_PATTERN_STATUS_PRIORITY", []):
|
||||
# Decode
|
||||
pattern, status, priority = list_pattern_status_priority
|
||||
# Query
|
||||
query = "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', {}, '{}');".format(pattern, priority, status)
|
||||
print(query)
|
||||
cur.execute(query)
|
||||
|
||||
# Feeds, URL host, keyword search
|
||||
for search_type, list_searches in data_json.get("SEARCH", {}).items():
|
||||
for search in list_searches:
|
||||
query = "INSERT INTO SEARCH (search, type) VALUES ('{}', '{}');".format(search, search_type)
|
||||
print(query)
|
||||
cur.execute(query)
|
||||
|
||||
# Try finding RSS feed
|
||||
if (search_type == "url_host"):
|
||||
url_host = search
|
||||
list_feeds = find_feeds(url_host)
|
||||
# If not exists, insert feed
|
||||
for feed in list_feeds:
|
||||
query = "INSERT INTO SEARCH (search, type) VALUES ('{}', '{}') ON CONFLICT DO NOTHING;".format(feed, "rss_feed")
|
||||
print(query)
|
||||
cur.execute(query)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser(description='Database initialization')
|
||||
parser.add_argument('--initialize_tables', help='Create DB tables', action='store_true', default=False)
|
||||
parser.add_argument('--initialize_data', help='Insert data', action='store_true', default=False)
|
||||
args = parser.parse_args()
|
||||
|
||||
# Wait for DB connection
|
||||
wait_connection()
|
||||
|
||||
if (args.initialize_tables):
|
||||
print("Initializing tables")
|
||||
initialize_tables()
|
||||
if (args.initialize_data):
|
||||
print("Initializing data")
|
||||
initialize_data()
|
||||
@@ -4,7 +4,7 @@ if [ "${INITIALIZE_DB}" = false ]; then
echo "Initialization not required"
else
echo "Initializing database"
python db.py --initialize_tables --initialize_data
python init_db.py --initialize_tables --initialize_data
python manage.py makemigrations fetcher; python manage.py migrate --fake-initial
python manage.py createsuperuser --noinput
python manage.py collectstatic --no-input
@@ -5,6 +5,8 @@ psycopg[binary]
gunicorn
whitenoise
feedparser
html5lib
requests
python-dateutil
newspaper4k[all]
lxml[html_clean]

@@ -48,7 +48,7 @@ services:
- DJANGO_ALLOWED_HOSTS=${DJANGO_ALLOWED_HOSTS:-*} # host1,host2
- DJANGO_ALLOWED_ORIGINS=${ALLOWED_ORIGINS:-https://fetcher.matitos.org} # Reverse proxy
- DJANGO_SECRET_KEY=${DJANGO_SECRET_KEY:-abc123456789qwerty}
- DJANGO_DEBUG=${DJANGO_DEBUG:-False}
- DJANGO_DEBUG=${DJANGO_DEBUG:-True}
- PATH_LOGS_DIRECTORY=${PATH_LOGS_DIRECTORY:-/opt/logs}
# Database
- DB_NAME=${DB_NAME:-matitos}
@@ -61,10 +61,11 @@ services:
# Job timeout: 30 min
- JOB_DEFAULT_TIMEOUT=${JOB_DEFAULT_TIMEOUT:-1800}
# Fetcher
- FETCHER_GNEWS_DECODE_SLEEP=${FETCHER_GNEWS_DECODE_SLEEP-2}
- FETCHER_GNEWS_DECODE_SLEEP=${FETCHER_GNEWS_DECODE_SLEEP-1.5}
- FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP=${FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP:-5}
- FETCHER_BETWEEN_SEARCHES_SLEEP=${FETCHER_BETWEEN_SEARCHES_SLEEP:-1}
- FETCHER_URL_HOST_SLEEP=${FETCHER_URL_HOST_SLEEP:-2}
- FETCHER_URL_HOST_SLEEP=${FETCHER_URL_HOST_SLEEP:-1.5}
- FETCHER_LANGUAGE_DETECTION_MIN_CHAR=100
# Selenium
- SELENIUM_ENDPOINT=${SELENIUM_ENDPOINT:-http://fetcher_app_selenium:80}
- ENDPOINT_OLLAMA=${ENDPOINT_OLLAMA:-https://ollamamodel.matitos.org}
@@ -105,8 +106,8 @@ services:
POSTGRES_PASSWORD: ${DB_PASSWORD:-supermatitos}
POSTGRES_USER: ${DB_USER:-supermatitos}
POSTGRES_INITDB_ARGS: '--data-checksums'
volumes: # Persistent DB?
- ${PATH_DB_DATA:-.}/postgres:/var/lib/postgresql/data
#volumes: # Persistent DB?
# - ${PATH_DB_DATA:-.}/postgres:/var/lib/postgresql/data
ports:
- 5432 #:5432

@@ -59,10 +59,11 @@ services:
# Job timeout: 30 min
- JOB_DEFAULT_TIMEOUT=${JOB_DEFAULT_TIMEOUT:-1800}
# Fetcher
- FETCHER_GNEWS_DECODE_SLEEP=${FETCHER_GNEWS_DECODE_SLEEP-2}
- FETCHER_GNEWS_DECODE_SLEEP=${FETCHER_GNEWS_DECODE_SLEEP-1.5}
- FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP=${FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP:-5}
- FETCHER_BETWEEN_SEARCHES_SLEEP=${FETCHER_BETWEEN_SEARCHES_SLEEP:-1}
- FETCHER_URL_HOST_SLEEP=${FETCHER_URL_HOST_SLEEP:-2}
- FETCHER_URL_HOST_SLEEP=${FETCHER_URL_HOST_SLEEP:-1.5}
- FETCHER_LANGUAGE_DETECTION_MIN_CHAR=100
# Selenium
- SELENIUM_ENDPOINT=${SELENIUM_ENDPOINT:-http://fetcher_app_selenium:80}
- ENDPOINT_OLLAMA=${ENDPOINT_OLLAMA:-https://ollamamodel.matitos.org}
