Working fetch search, refactoring DB towards source search
123 1-DB.ipynb
File diff suppressed because one or more lines are too long
@@ -2,15 +2,80 @@
"cells": [
{
"cell_type": "code",
"execution_count": null,
"execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
"import newspaper\n",
"url = \"http://www.missingkids.org/poster/NCMC/2045193/1\"\n",
"#url = \"https://www.missingkids.org/new-poster/NCMC/2045193/1\"\n",
"# !pip install git+https://github.com/tasos-py/Search-Engines-Scraper.git\n",
"import search_engines\n",
"\n",
"art = newspaper.article(url)"
"engine = search_engines.Bing()"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Searching Bing \n",
" \r"
]
}
],
"source": [
"results = engine.search('news: \"child abuse\"', pages=2)"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"engine = search_engines.search_engines_dict[\"brave\"]()"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Searching Brave \n",
" \r"
]
}
],
"source": [
"query = 'news: child abuse'\n",
"r = engine.search(query, pages=2)"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'_results': []}"
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"r.__dict__"
]
},
{
@@ -18,8 +83,57 @@
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"newspaper.exceptions.ArticleBinaryDataException"
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"art.__dict__"
"import newspaper\n",
"newspaper.ArticleBinaryDataException"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"'''\n",
"import newspaper\n",
"\n",
"url = 'https://www.missingkids.org/poster/USVA/VA25-0820/1'\n",
"art_1 = newspaper.article(url)\n",
"url = 'https://www.missingkids.org/poster/NCMC/2045193/1'\n",
"art_2 = newspaper.article(url)\n",
"'''"
]
},
{
@@ -44,15 +158,8 @@
"l = client.list()\n",
"list_models = [m.get(\"model\") for m in l.model_dump().get(\"models\")]\n",
"\n",
"list_models"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(list_models)\n",
"\n",
"for m in list_models:\n",
" context_key = [ k for k in client.show(m).model_dump().get(\"modelinfo\").keys() if \"context_length\" in k]\n",
" if (len(context_key) != 1):\n",
@@ -3,7 +3,7 @@
conda create -n matitos_urls python=3.12
conda activate matitos_urls
pip install django psycopg[binary] django-rq
pip install feedparser python-dateutil newspaper4k lxml[html_clean]
pip install feedparser python-dateutil newspaper4k lxml[html_clean] googlenewsdecoder gnews duckduckgo_search GoogleNews
```

* From automated inspectdb

@@ -11,38 +11,59 @@ pip install feedparser python-dateutil newspaper4k lxml[html_clean]
# 1) Inspect DB, generate models.py
python manage.py inspectdb

# 2) models.py, within class Urls, add:
# 2) Modify models.py

# URLS:
class Urls(models.Model):
    class STATUS_ENUM(models.TextChoices):
        RAW = "raw"
        ERROR = "error"
        VALID = "valid"
        UNKNOWN = "unknown"
        INVALID = "invalid"
        DUPLICATE = "duplicate"
        RAW = "raw", "Raw"
        ERROR = "error", "Error"
        VALID = "valid", "Valid"
        UNKNOWN = "unknown", "Unknown"
        INVALID = "invalid", "Invalid"
        DUPLICATE = "duplicate", "Duplicate"

    # Update status
    status = models.TextField(choices=STATUS_ENUM, default=STATUS_ENUM.RAW) # This field type is a guess.
    url = models.TextField(unique=True)
    ts_fetch = models.DateTimeField(auto_now_add=True)
    status = models.TextField(choices=STATUS_ENUM.choices, default=STATUS_ENUM.RAW) # This field type is a guess.

# To class Meta, add default ordering
    class Meta:
        managed = False
        db_table = 'urls' # db_table = '{}_urls'.format(project_name)
        db_table = 'urls'
        ordering = ["-ts_fetch"]

# Fields default:
ts_fetch = models.DateTimeField(auto_now_add=True)
status = models.TextField(default='raw') # This field type is a guess.

# SEARCH:
class Search(models.Model):
    class TYPE_ENUM(models.TextChoices):
        RSS_FEED = "rss_feed", "RSS_Feed"
        KEYWORD_SEARCH = "keyword_search", "Keyword_Search"
        URL_HOST = "url_host", "URL_Host"

# URLContent:
from django.contrib.postgres.fields import ArrayField
    id = models.SmallAutoField(primary_key=True)
    search = models.TextField(unique=True)
    type = models.TextField(choices=TYPE_ENUM.choices) # This field type is a guess.

# URL_CONTENT:
class UrlContent(models.Model):
    id_url = models.OneToOneField('Urls', models.DO_NOTHING, db_column='id_url', primary_key=True)
    date_published = models.DateTimeField(blank=True, null=True)
    title = models.TextField(blank=True, null=True)
    description = models.TextField(blank=True, null=True)
    content = models.TextField(blank=True, null=True)
    valid_content = models.BooleanField(blank=True, null=True)
    language = models.CharField(max_length=2, blank=True, null=True)
    keywords = ArrayField(models.TextField(blank=True, null=True)) # This field type is a guess.
    tags = ArrayField(models.TextField(blank=True, null=True)) # This field type is a guess.
    authors = ArrayField(models.TextField(blank=True, null=True)) # This field type is a guess.
    image_main_url = models.TextField(blank=True, null=True)
    images_url = ArrayField(models.TextField(blank=True, null=True)) # This field type is a guess.
    videos_url = ArrayField(models.TextField(blank=True, null=True)) # This field type is a guess.
    url_host = models.TextField(blank=True, null=True)
    site_name = models.TextField(blank=True, null=True)

    # TODO: Associate db_table name with a prefix on project_name
    class Meta:
        db_table = 'urls' # db_table = '{}_urls'.format(project_name)
```
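A minimal usage sketch of the consolidated `Search` table this refactor introduces (it replaces the separate `Feed` and `WebsiteOfInterest` tables). The `api.models` import path and the example feed URL are assumptions for illustration; the keyword and host values come from the code in this commit:

```
# Sketch only: assumes the Django app is named "api" and runs inside e.g. `python manage.py shell`.
from api.models import Search, Urls

# One row per source; `search` holds the feed URL, keyword, or URL host, `type` says which.
Search.objects.get_or_create(search="https://example.org/rss.xml", type=Search.TYPE_ENUM.RSS_FEED)   # hypothetical feed
Search.objects.get_or_create(search="child abuse", type=Search.TYPE_ENUM.KEYWORD_SEARCH)
Search.objects.get_or_create(search="missingkids.org", type=Search.TYPE_ENUM.URL_HOST)

# The fetchers pull their inputs by type, mirroring fetch_feed.py / fetch_search.py:
keywords = list(Search.objects.filter(type=Search.TYPE_ENUM.KEYWORD_SEARCH).values_list("search", flat=True))

# Freshly inserted URLs start as RAW and come back newest-first thanks to ordering = ["-ts_fetch"].
pending = Urls.objects.filter(status=Urls.STATUS_ENUM.RAW)[:10]
```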

* Environment variables

@@ -55,6 +76,9 @@ DB_PORT=${DB_NAME:-5432}

REDIS_HOST=${REDIS_HOST:-localhost}
REDIS_PORT=${REDIS_PORT:-6379}

# Default RQ queue timeout
RQ_DEFAULT_TIMEOUT=${RQ_DEFAULT_TIMEOUT:-900}
```

* Django DB

@@ -0,0 +1,26 @@
# Generated by Django 5.1.7 on 2025-03-19 09:06

from django.db import migrations


class Migration(migrations.Migration):

    dependencies = [
        ('api', '0001_initial'),
    ]

    operations = [
        migrations.DeleteModel(
            name='Feed',
        ),
        migrations.DeleteModel(
            name='WebsiteOfInterest',
        ),
        migrations.DeleteModel(
            name='WebsiteToFilter',
        ),
        migrations.AlterModelOptions(
            name='urls',
            options={'managed': False, 'ordering': ['-ts_fetch']},
        ),
    ]
@@ -2,18 +2,15 @@ from django.db import models
from django.contrib.postgres.fields import ArrayField

# Create your models here.
class Feed(models.Model):
    id = models.SmallAutoField(primary_key=True)
    rss_feed = models.TextField(unique=True)

    class Meta:
        managed = False
        db_table = 'feed'


class Search(models.Model):
    class TYPE_ENUM(models.TextChoices):
        RSS_FEED = "rss_feed", "RSS_Feed"
        KEYWORD_SEARCH = "keyword_search", "Keyword_Search"
        URL_HOST = "url_host", "URL_Host"

    id = models.SmallAutoField(primary_key=True)
    keyword_search = models.TextField(unique=True)
    search = models.TextField(unique=True)
    type = models.TextField(choices=TYPE_ENUM.choices) # This field type is a guess.

    class Meta:
        managed = False
@@ -77,6 +74,7 @@ class Urls(models.Model):
    class Meta:
        managed = False
        db_table = 'urls'
        ordering = ["-ts_fetch"]


class UrlsDuplicate(models.Model):
@@ -96,13 +94,4 @@ class UrlsSource(models.Model):
    class Meta:
        managed = False
        db_table = 'urls_source'
        unique_together = (('id_url', 'id_source'),)


class WebsiteOfInterest(models.Model):
    id = models.SmallAutoField(primary_key=True)
    url_host = models.TextField(unique=True)

    class Meta:
        managed = False
        db_table = 'website_of_interest'
        unique_together = (('id_url', 'id_source'),)
@@ -12,7 +12,6 @@ logger = get_logger()

class DB_Handler():
    def __init__(self):
        logger.debug("Initializing URL DB Handler")
        # Inserting raw URL, cache time: 1 day
        self._cache_timeout_insert_url = 86400
        # Processing error URL, cache time: 2 days
@@ -37,16 +36,15 @@ class DB_Handler():
        else:
            return cache.get(cache_key) is not None

    def insert_raw_urls(self, urls, source):

        def clean_protocol(url):
            # http:// -> https://
            url = url.replace("http://", "https://")
            # "" -> https://
            if not (url.startswith("https://")):
                url = "https://" + url
            return url

    def _clean_protocol(self, url):
        # http:// -> https://
        url = url.replace("http://", "https://")
        # "" -> https://
        if not (url.startswith("https://")):
            url = "https://" + url
        return url
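For reference, a standalone sketch of what the `_clean_protocol` normalization above does (not part of the commit; the same two rules written out with hypothetical example URLs):

```
def clean_protocol(url: str) -> str:
    # Rule 1: force https:// over http://
    url = url.replace("http://", "https://")
    # Rule 2: bare hosts get an https:// prefix
    if not url.startswith("https://"):
        url = "https://" + url
    return url

assert clean_protocol("http://example.org/a") == "https://example.org/a"
assert clean_protocol("example.org/a") == "https://example.org/a"
assert clean_protocol("https://example.org/a") == "https://example.org/a"
```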

    def insert_raw_urls(self, urls, source):
        try:
            logger.debug("Inserting raw URLs")
            # Empty?
@@ -55,7 +53,7 @@ class DB_Handler():
                return

            # Default protocol https://
            urls_clean = [clean_protocol(url) for url in urls]
            urls_clean = [self._clean_protocol(url) for url in urls]

            # Get the source (create if not exists)
            source_obj, created = Source.objects.get_or_create(source=source)
@@ -90,7 +88,7 @@ class DB_Handler():
            UrlsSource.objects.bulk_create([UrlsSource(id_source=source_obj, id_url=url_obj) for url_obj in bulk_created_urls], ignore_conflicts=True)
        except IntegrityError as e:
            ### Fallback to one-by-one insert
            logger.debug("bulk_create exception while inserting raw URLs, falling back to non-bulk method")
            logger.debug("bulk_create exception while inserting raw URLs (fails if duplicated URL), falling back to non-bulk method")
            # One by one
            for url in urls_to_insert:
                # URL
@@ -177,9 +175,16 @@ class DB_Handler():
            set_status(obj_url, Urls.STATUS_ENUM.ERROR)
            # Next URL
            return

        # Invalid? e.g. binary data
        if (dict_url_data.get("override_status") == "invalid"):
            # Update status
            set_status(obj_url, Urls.STATUS_ENUM.INVALID)
            # Next URL
            return

        ##### Canonical URL different? -> Duplicate
        if (dict_url_data.get("url") != dict_url_data.get("url_canonical")):
        if (dict_url_data.get("url_canonical") is not None) and (dict_url_data.get("url") != dict_url_data.get("url_canonical")):
            # Update status
            set_status(obj_url, Urls.STATUS_ENUM.DUPLICATE)

@@ -194,6 +199,10 @@ class DB_Handler():
            # URLs duplicate association
            obj_urls_duplicate, created = UrlsDuplicate.objects.get_or_create(id_url_canonical=obj_url_canonical, id_url_duplicated=obj_url)

            # TODO: return obj_url_canonical so as to directly process the recently inserted URL
            # Wherever this function is called, add:
            # self._process_single_url(obj_url_canonical, status_pattern_match, raise_exception_on_error)

            # Next URL
            return

@@ -273,6 +282,7 @@ class DB_Handler():
        for obj_url in error_urls:
            # URL ID cached? -> Tried to process recently already, skip
            if (self._is_cached_key("error_{}".format(obj_url.id), hash_encoded=False)):
                logger.debug("Already cached URL ID: {}".format(obj_url.id))
                num_urls_skipped += 1
                continue

@@ -299,7 +309,7 @@ class DB_Handler():
        missingkids_urls = Urls.objects.order_by("-ts_fetch").filter(
            (Q(url__contains="missingkids.org/poster") | Q(url__contains="missingkids.org/new-poster"))
            &
            (Q(status=Urls.STATUS_ENUM.VALID) | Q(status=Urls.STATUS_ENUM.INVALID))
            (Q(status=Urls.STATUS_ENUM.VALID) | Q(status=Urls.STATUS_ENUM.INVALID) | Q(status=Urls.STATUS_ENUM.ERROR))
        )[:batch_size]

        # Per URL

@@ -1,5 +1,5 @@
from .db_utils import DB_Handler
from ..models import Feed
from ..models import Search
import feedparser
import dateutil
import traceback
@@ -15,7 +15,7 @@ class FetchFeeds():
        logger.debug("Starting FetchFeeds.run()")

        # Get feeds
        list_url_feeds = list(Feed.objects.values_list('rss_feed', flat=True))
        list_url_feeds = list(Search.objects.filter(type=Search.TYPE_ENUM.RSS_FEED).values_list('search', flat=True))
        logger.debug("Fetching from feeds: {}".format(list_url_feeds))

        # Process via RSS feeds

@@ -1,5 +1,5 @@
from .db_utils import DB_Handler
from ..models import WebsiteOfInterest
from ..models import Search
import newspaper
import traceback
from .logger import get_logger
@@ -14,7 +14,7 @@ class FetchParser():
        logger.debug("Starting FetchParser.run() for {}")

        # Get URL hosts
        list_url_host = list(WebsiteOfInterest.objects.values_list('url_host', flat=True))
        list_url_host = list(Search.objects.filter(type=Search.TYPE_ENUM.URL_HOST).values_list('search', flat=True))
        logger.debug("Fetching news by parsing URL hosts: {}".format(list_url_host))

        # Process newspaper4k build method

75 app_urls/api/src/fetch_search.py Normal file
@@ -0,0 +1,75 @@
from .db_utils import DB_Handler
from ..models import Search
import traceback
import time
from .fetch_search_utils import search_gnews, search_ddg, search_googlenews_general, search_googlenews_news
from .logger import get_logger
logger = get_logger()

class FetchSearcher():
    def __init__(self) -> None:
        logger.debug("Initializing Fetcher Searcher")

    def run(self):
        try:
            logger.debug("Starting FetchSearcher.run()")

            # Get keyword searches of interest
            list_keyword_search = list(Search.objects.filter(type=Search.TYPE_ENUM.KEYWORD_SEARCH).values_list('search', flat=True))
            # Get URL host of interest
            list_url_host = list(Search.objects.filter(type=Search.TYPE_ENUM.URL_HOST).values_list('search', flat=True))

            # TODO: allintitle: "child abuse"
            # TODO: intitle: "child abuse"
            # list_keyword_search + ['allintitle: "{}"'.format(s) for s in list_keyword_search] + ['intitle: "{}"'.format(s) for s in list_keyword_search]
            # Merge searches
            list_search = list_keyword_search + ["site:{}".format(u) for u in list_url_host]
            logger.debug("Fetching from keyword search: {}".format(list_search))

            # Search
            for keyword_search in list_search:
                # TODO: language & country customization

                # DDG News
                time.sleep(5)
                raw_urls, source = search_ddg(keyword_search, category="news", timelimit="d", max_results=None, region = "wt-wt")
                # Write to DB
                DB_Handler().insert_raw_urls(raw_urls, source)

                # GNews
                time.sleep(5)
                raw_urls, source = search_gnews(keyword_search, language="en", country="US")
                # Write to DB
                DB_Handler().insert_raw_urls(raw_urls, source)

                # DDG Text
                time.sleep(5)
                raw_urls, source = search_ddg(keyword_search, category="text", timelimit="d", max_results=None, region = "wt-wt")
                # Write to DB
                DB_Handler().insert_raw_urls(raw_urls, source)

                # GoogleNews news
                time.sleep(5)
                raw_urls, source = search_googlenews_news(keyword_search, period="1d", language="en", country="US")
                # Write to DB
                DB_Handler().insert_raw_urls(raw_urls, source)
                # GoogleNews general
                time.sleep(5)
                raw_urls, source = search_googlenews_general(keyword_search, period="1d", language="en", country="US", max_pages=5)
                # Write to DB
                DB_Handler().insert_raw_urls(raw_urls, source)

            # TODO:
            # SearxNG
            """
            period = "day"
            for searx_instance in get_searxng_instances():
                dict_params_news = {"search": search_text, "searx_instance": searx_instance, "lang": lang, "region": region, "search_category": "news", "period": period}
                dict_params_general = {"search": search_text, "searx_instance": searx_instance, "lang": lang, "region": region, "search_category": "general", "period": period}
                # Append thread
                FetcherSearxNews(**dict_params_news).fetch_articles(self.db_handler)
                FetcherSearxNews(**dict_params_general).fetch_articles(self.db_handler)"
            """
            # TODO: https://github.com/tasos-py/Search-Engines-Scraper/tree/master
        except Exception as e:
            logger.warning("Exception in FetchSearcher.run(): {}\n{}".format(e, traceback.format_exc()))
129 app_urls/api/src/fetch_search_utils.py Normal file
@@ -0,0 +1,129 @@
import traceback
import random
import time
from .logger import get_logger
logger = get_logger()

from googlenewsdecoder import gnewsdecoder
from gnews import GNews
from duckduckgo_search import DDGS
from GoogleNews import GoogleNews

###########################################################################
def decode_gnews_urls(encoded_urls):
    # Decode URLs
    list_decoded_urls = []
    for url in encoded_urls:
        try:
            # Decode URL, with interval time to avoid block
            decoded_url = gnewsdecoder(url, interval=5)
            # Ok?
            if decoded_url.get("status"):
                list_decoded_urls.append(decoded_url["decoded_url"])
            else:
                logger.warning("Error decoding news.google.com, URL {}\nMessage: {}".format(url, decoded_url["message"]))
        except Exception as e:
            logger.warning("Error decoding news.google.com, URL: {}\n{}".format(url, traceback.format_exc()))
    return list_decoded_urls

###########################################################################

def search_gnews(keyword_search, period="1d", language="en", country="US", max_results=100):
    # [source] [category] [period] [language-country] [max_results]
    source = "gnews {} {} {}-{} max_results={}".format("news", period, language, country, max_results).replace("None", "").strip()
    logger.debug("Searching: {} --- Source:{}".format(keyword_search, source))

    # Get news
    results_gnews = GNews(language=language, country=country).get_news(keyword_search)
    # Get list of encoded urls
    encoded_urls = [e.get("url") for e in results_gnews]
    # Decode
    list_decoded_urls = decode_gnews_urls(encoded_urls)
    return list_decoded_urls, source

###########################################################################

def search_ddg(keyword_search, category="news", timelimit="d", max_results=None, region="wt-wt"):
    # [source] [category] [period] [language-country] [max_results]
    source = "ddg {} {} {} max_results={}".format(category, timelimit, region, max_results).replace("None", "").strip()
    logger.debug("Searching: {} --- Source:{}".format(keyword_search, source))

    # region="{}-{}".format(language, country.lower())
    # timelimit= # Options: d, w, m
    # max_results # max number of results. If None, returns results only from the first response. Defaults to None

    if (category == "news"):
        news = DDGS().news(keyword_search, region=region, timelimit=timelimit, max_results=max_results)
        urls = [e.get("url") for e in news]
    if (category == "text"):
        news = DDGS().text(keyword_search, region=region, timelimit=timelimit, max_results=max_results)
        urls = [e.get("href") for e in news]

    return urls, source
###########################################################################

def search_googlenews_news(keyword_search, period="1d", language="en", country="US"):
    category = "news"
    # [source] [category] [period] [language-country]
    source = "googlenews {} {} {}-{}".format(category, period, language, country).replace("None", "").strip()
    logger.debug("Searching: {} --- Source:{}".format(keyword_search, source))

    # Initialize
    googlenews = GoogleNews(period=period, lang=language, region=country)
    googlenews.enableException(True)

    try:
        # Search
        googlenews.get_news(keyword_search)
        # Fetch
        encoded_urls = googlenews.get_links()
        # Decode
        urls = decode_gnews_urls(encoded_urls)
    except Exception as e:
        logger.warning("Exception fetching {}: {}\n{}".format(source, str(e), traceback.format_exc()))
        urls = []

    return urls, source

def search_googlenews_general(keyword_search, period="1d", language="en", country="US", max_pages=5):
    category = "general"
    # [source] [category] [period] [language-country] [max_results]
    source = "googlenews {} {} {}-{} max_pages={}".format(category, period, language, country, max_pages).replace("None", "").strip()
    logger.debug("Searching: {} --- Source:{}".format(keyword_search, source))

    # Initialize
    googlenews = GoogleNews(period=period, lang=language, region=country)
    googlenews.enableException(True)

    try:
        set_links = set()
        # Search
        googlenews.search(keyword_search)

        # Iterate pages
        for i in range(max_pages):
            time.sleep(random.uniform(1, 2.5))
            num_before = len(set_links)

            # Get page
            try:
                links = googlenews.page_at(i+1)
            except Exception as e:
                logger.warning("Exception fetching page in GoogleNews {}: {}".format(source, str(e)))
                break
            # Links
            for l in links:
                # 'link': 'https://uk.news.yahoo.com/leaving-neverland-2-michael-jackson-lawyer-channel-4-102017088.html&ved=2ahUKEwjl38eJm5aMAxVvqJUCHXgnGzwQxfQBegQICRAC&usg=AOvVaw1osa6b3o_xXfcNinMDpLoK'
                set_links.add( l.get("link").split("&ved=")[0] )
            # Finished?
            if (num_before == len(set_links)):
                break
        # To list
        urls = list(set_links)
    except Exception as e:
        logger.warning("Exception fetching {}: {}\n{}".format(source, str(e), traceback.format_exc()))
        urls = []

    return urls, source

###########################################################################
@@ -12,6 +12,9 @@ def process_url(url):
    try:
        # Process
        article = newspaper.article(url)
    except newspaper.ArticleBinaryDataException as e:
        logger.warning("ArticleBinaryDataException for input URL {}\n{}".format(url, str(e)))
        return {"override_status": "invalid"}
    except newspaper.ArticleException as e:
        logger.warning("ArticleException for input URL {}\n{}".format(url, str(e)))
        return None

@@ -2,6 +2,7 @@ from django_rq import job

from .src.fetch_feed import FetchFeeds
from .src.fetch_parser import FetchParser
from .src.fetch_search import FetchSearcher
from .src.db_utils import DB_Handler
'''
from src.fetch_search import FetchSearcher
@@ -21,16 +22,20 @@ def background_task(process_type: str):
        FetchFeeds().run()
    elif (process_type == "fetch_parser"):
        FetchParser().run()
    # TODO: ENCODE BATCH_SIZE IN PROCESS_TYPE..
    elif (process_type == "process_raw_urls"):
        DB_Handler().process_raw_urls(batch_size=50)
    elif (process_type == "process_error_urls"):
        DB_Handler().process_error_urls(batch_size=50)
    elif (process_type == "process_missing_kids_urls"):
        DB_Handler().process_missing_kids_urls(batch_size=50)
    elif ("process_missing_kids_urls" in process_type):
    elif (process_type == "fetch_search"):
        FetchSearcher().run()
    #elif (process_type == "fetch_missingkids"):
    #    FetchMissingKids().run()
    elif ("process_" in process_type):
        # Batch size encoded in URL
        batch_size = int(process_type.split("_")[-1])
        DB_Handler().process_missing_kids_urls(batch_size=batch_size)
        # Task type
        if ("process_raw_urls" in process_type):
            DB_Handler().process_raw_urls(batch_size=batch_size)
        elif ("process_error_urls" in process_type):
            DB_Handler().process_error_urls(batch_size=batch_size)
        elif ("process_missing_kids_urls" in process_type):
            DB_Handler().process_missing_kids_urls(batch_size=batch_size)
    else:
        logger.info("Task unknown!: {}".format(process_type))
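The convention introduced in this hunk is that `process_*` task names carry their batch size as a trailing `_<N>` suffix, matching the links exposed by `link_list` (e.g. `process_raw_urls_50`). A sketch of that parsing step in isolation; `split_process_task` is a hypothetical helper, not part of the commit:

```
def split_process_task(process_type: str) -> tuple[str, int]:
    # "process_raw_urls_50" -> ("process_raw_urls", 50); assumes a numeric suffix,
    # exactly as background_task() does with int(process_type.split("_")[-1]).
    prefix, _, suffix = process_type.rpartition("_")
    return prefix, int(suffix)

assert split_process_task("process_raw_urls_50") == ("process_raw_urls", 50)
assert split_process_task("process_missing_kids_urls_500000") == ("process_missing_kids_urls", 500000)
```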

@@ -47,15 +52,7 @@ def background_task(process_type: str):
        MissingKidsFetch(db_handler, num_pages=4).run()
    elif (process_type == "fetch_missing_kids_full"):
        MissingKidsFetch(db_handler, num_pages=100000).run()

    elif (process_type == "update_missing_kids_status_reduced"):
        MissingKidsStatus(cred.db_connect_info, cred.redis_connect_info, num_urls=50).update_missing_kids_status()
    elif (process_type == "update_missing_kids_status_full"):
        MissingKidsStatus(cred.db_connect_info, cred.redis_connect_info, num_urls=None).update_missing_kids_status()

    elif (process_type == "update_error_urls"):
        UpdateErrorURLs(cred.db_connect_info, cred.redis_connect_info, num_urls=100).update_error_urls_status()

    else:
        logger.error("Task error, unknown type: {}".format(process_type))
        return

@@ -9,11 +9,22 @@ logger = get_logger()

def trigger_task(request, task):
    """View that enqueues a task."""

    """
    if ("fetch_" in task):
        priority = "low"
        job_timeout="30m"
    elif ("process_" in task):
        priority = "medium"
        job_timeout="30m"
    """

    queue = django_rq.get_queue('default') # Get the default queue
    job = queue.enqueue(background_task, task)
    job = queue.enqueue(background_task, task, job_timeout="30m")
    return JsonResponse({"message": "Task has been enqueued!", "job_id": job.id})

def link_list(request):
    prefix = "http://localhost:8000/api"
    links = ["fetch_feeds", "fetch_parser", "process_raw_urls", "process_error_urls", "process_missing_kids_urls_50", "process_missing_kids_urls_500000"]
    return JsonResponse({"links": ["http://localhost:8080/?pgsql=matitos_db&username=supermatitos&db=matitos&ns=public&select=urls&order%5B0%5D=id"] + [os.path.join(prefix, l) for l in links]})
    links = ["fetch_feeds", "fetch_parser", "fetch_search", "process_raw_urls_50", "process_error_urls_50", "process_missing_kids_urls_50", "process_missing_kids_urls_500000"]
    db_links = ["http://localhost:8080/?pgsql=matitos_db&username=supermatitos&db=matitos&ns=public&select=urls&order%5B0%5D=id&limit=500"]
    return JsonResponse({"links": db_links + [os.path.join(prefix, l) for l in links]})
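For manual testing, any of the links returned by `link_list` can be requested to enqueue the matching job. A client-side sketch assuming the dev server runs on localhost:8000 and the URLconf maps /api/&lt;task&gt; to trigger_task, as the links above imply:

```
import requests  # third-party HTTP client, assumed installed (pip install requests)

# Enqueue the new fetch_search task and print the JSON response,
# e.g. {"message": "Task has been enqueued!", "job_id": "..."}.
resp = requests.get("http://localhost:8000/api/fetch_search")
print(resp.json())
```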
@@ -106,7 +106,7 @@ RQ_QUEUES = {
        'HOST': os.environ.get("REDIS_HOST", "localhost"),
        'PORT': os.environ.get("REDIS_PORT", 6379),
        'DB': os.environ.get("REDIS_DB", 0),
        'DEFAULT_TIMEOUT': os.environ.get("REDIS_DEFAULT_TIMEOUT", 360),
        'DEFAULT_TIMEOUT': os.environ.get("RQ_DEFAULT_TIMEOUT", 900),
    }
}