Unknown instead of error for fetched urls
This commit is contained in:
@@ -6,8 +6,9 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# !pip install flask\n",
|
"!uvicorn app:app --workers 1 --log-level info --port 5001\n",
|
||||||
"!python app.py"
|
"#!uvicorn app:app --reload --log-level debug --port 8000\n",
|
||||||
|
"#!python app.py"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
|
|||||||
@@ -2,6 +2,7 @@ from django.core.cache import cache
|
|||||||
from .logger import get_logger
|
from .logger import get_logger
|
||||||
logger = get_logger()
|
logger = get_logger()
|
||||||
import newspaper
|
import newspaper
|
||||||
|
import requests
|
||||||
import time
|
import time
|
||||||
import os
|
import os
|
||||||
from urllib.parse import unquote
|
from urllib.parse import unquote
|
||||||
@@ -39,7 +40,7 @@ def url_host_slowdown(url, url_host_slowdown_seconds):
|
|||||||
cache.set("process_{}".format(url_host).encode("utf-8"), time.time(), timeout=60*5) # Expire after 5 minutes
|
cache.set("process_{}".format(url_host).encode("utf-8"), time.time(), timeout=60*5) # Expire after 5 minutes
|
||||||
|
|
||||||
def process_url(url, paywall_bypass=False):
|
def process_url(url, paywall_bypass=False):
|
||||||
|
|
||||||
if (paywall_bypass):
|
if (paywall_bypass):
|
||||||
# TODO: Implement self-hosted instance
|
# TODO: Implement self-hosted instance
|
||||||
url_paywall_bypass_base = "https://marreta.pcdomanual.com"
|
url_paywall_bypass_base = "https://marreta.pcdomanual.com"
|
||||||
@@ -57,22 +58,46 @@ def process_url(url, paywall_bypass=False):
|
|||||||
logger.warning("ArticleException for input URL {}".format(url))
|
logger.warning("ArticleException for input URL {}".format(url))
|
||||||
return {"override_status": "invalid"}
|
return {"override_status": "invalid"}
|
||||||
except newspaper.ArticleException as e:
|
except newspaper.ArticleException as e:
|
||||||
|
|
||||||
|
# Too many requests or blocked for some reason
|
||||||
|
if ("Status code 403" in str(e.args)):
|
||||||
|
# TODO: cool down and retry once?, proxy/VPN, ...
|
||||||
|
logger.debug("TODO: process_url Implement code 403")
|
||||||
|
|
||||||
|
# Not found, either it doesn't exist or getting blocked...
|
||||||
|
if ("Status code 404" in str(e.args)):
|
||||||
|
# TODO: cool down and retry once?, proxy/VPN, ...
|
||||||
|
logger.debug("TODO: process_url Implement code 404")
|
||||||
|
|
||||||
# Too many requests? Cool down...
|
# Too many requests? Cool down...
|
||||||
if ("Status code 429" in str(e.args)):
|
if ("Status code 429" in str(e.args)):
|
||||||
# TODO: cool down and retry once?, proxy/VPN, ...
|
# TODO: cool down and retry once?, proxy/VPN, ...
|
||||||
logger.debug("TODO: process_url Implement code 429")
|
logger.debug("TODO: process_url Implement code 429")
|
||||||
|
|
||||||
# Unavailable for legal reasons
|
# Unavailable for legal reasons
|
||||||
if ("Status code 451" in str(e.args)):
|
if ("Status code 451" in str(e.args)):
|
||||||
# TODO: Bypass with VPN
|
# TODO: Bypass with VPN
|
||||||
logger.debug("TODO: process_url Implement code 451")
|
logger.debug("TODO: process_url Implement code 451")
|
||||||
|
|
||||||
# CloudFlare protection?
|
# CloudFlare protection?
|
||||||
if ("Website protected with Cloudflare" in str(e.args)):
|
if ("Website protected with Cloudflare" in str(e.args)):
|
||||||
logger.debug("TODO: process_url Implement bypass CloudFlare")
|
logger.debug("TODO: process_url Implement bypass CloudFlare")
|
||||||
|
|
||||||
# PerimeterX protection?
|
# PerimeterX protection?
|
||||||
if ("Website protected with PerimeterX" in str(e.args)):
|
if ("Website protected with PerimeterX" in str(e.args)):
|
||||||
logger.debug("TODO: process_url Implement bypass PerimeterX")
|
logger.debug("TODO: process_url Implement bypass PerimeterX")
|
||||||
|
|
||||||
logger.debug("ArticleException for input URL {}\n{}".format(url, str(e.args)))
|
logger.debug("ArticleException for input URL {}\n{}".format(url, str(e.args)))
|
||||||
|
|
||||||
|
# Try simple request, valid response but couldn't parse article? e.g. getting blocked? -> unknown
|
||||||
|
time.sleep(0.25)
|
||||||
|
r = requests.get(url_of_interest)
|
||||||
|
if (r.status_code == 200):
|
||||||
|
return {"override_status": "unknown"}
|
||||||
|
else:
|
||||||
|
# Another status code still... "error" or "unknown"
|
||||||
|
return {"override_status": "unknown"}
|
||||||
|
|
||||||
return None
|
return None
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning("Exception for input URL {}\n{}".format(url, str(e)))
|
logger.warning("Exception for input URL {}\n{}".format(url, str(e)))
|
||||||
|
|||||||
@@ -1,65 +0,0 @@
|
|||||||
{
|
|
||||||
"SEARCH": {
|
|
||||||
"rss_feed": [
|
|
||||||
],
|
|
||||||
"url_host": [
|
|
||||||
"johnpilger.com",
|
|
||||||
"lapenseeecologique.com",
|
|
||||||
"partage-le.com",
|
|
||||||
"reflets.info",
|
|
||||||
"rezo.net",
|
|
||||||
"consortiumnews.com",
|
|
||||||
"disclose.ngo/fr",
|
|
||||||
"energieetenvironnement.com",
|
|
||||||
"global-climat.com",
|
|
||||||
"slashdot.org",
|
|
||||||
"lesamisdebartleby.wordpress.com",
|
|
||||||
"lundi.am",
|
|
||||||
"lvsl.fr",
|
|
||||||
"moderndiplomacy.eu",
|
|
||||||
"mrmondialisation.org",
|
|
||||||
"ourfiniteworld.com",
|
|
||||||
"southfront.org",
|
|
||||||
"simplicius76.substack.com",
|
|
||||||
"smoothiex12.blogspot.com",
|
|
||||||
"theintercept.com",
|
|
||||||
"wikileaks.org",
|
|
||||||
"contretemps.eu",
|
|
||||||
"indianpunchline.com",
|
|
||||||
"investigaction.net/fr",
|
|
||||||
"notechmagazine.com",
|
|
||||||
"terrestres.org",
|
|
||||||
"truthdig.com",
|
|
||||||
"tass.com",
|
|
||||||
"bastamag.net",
|
|
||||||
"counterpunch.org",
|
|
||||||
"energy-daily.com",
|
|
||||||
"fakirpresse.info",
|
|
||||||
"geopoliticalmonitor.com",
|
|
||||||
"huffingtonpost.fr",
|
|
||||||
"legrandsoir.info",
|
|
||||||
"les-crises.fr",
|
|
||||||
"liberation.fr",
|
|
||||||
"maitre-eolas.fr",
|
|
||||||
"marianne.net",
|
|
||||||
"mediapart.fr",
|
|
||||||
"metaefficient.com",
|
|
||||||
"monde-diplomatique.fr",
|
|
||||||
"paulcraigroberts.org",
|
|
||||||
"politis.fr",
|
|
||||||
"reporterre.net",
|
|
||||||
"rue89.com",
|
|
||||||
"theguardian.com/international",
|
|
||||||
"treehugger.com",
|
|
||||||
"unz.com",
|
|
||||||
"voltairenet.org",
|
|
||||||
"wsws.org"
|
|
||||||
],
|
|
||||||
"keyword_search": [
|
|
||||||
"society collapse"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"REGEX_PATTERN_STATUS_PRIORITY": [
|
|
||||||
[".*(youtube|tiktok|twitter|reddit)\\.com\\/.*", "invalid", 50]
|
|
||||||
]
|
|
||||||
}
|
|
||||||
@@ -1,34 +0,0 @@
|
|||||||
{
|
|
||||||
"SEARCH": {
|
|
||||||
"rss_feed": [
|
|
||||||
"https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC",
|
|
||||||
"https://feeds.feedburner.com/breitbart",
|
|
||||||
"https://feeds.feedburner.com/zerohedge/feed",
|
|
||||||
"https://moxie.foxnews.com/google-publisher/latest.xml",
|
|
||||||
"https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=15837362",
|
|
||||||
"https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=100727362"
|
|
||||||
],
|
|
||||||
"url_host": [
|
|
||||||
"missingkids.org/poster",
|
|
||||||
"missingkids.org/new-poster",
|
|
||||||
"breitbart.com",
|
|
||||||
"zerohedge.com",
|
|
||||||
"foxnews.com",
|
|
||||||
"cnbc.com"
|
|
||||||
],
|
|
||||||
"keyword_search": [
|
|
||||||
"child abuse"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"REGEX_PATTERN_STATUS_PRIORITY": [
|
|
||||||
[".*(youtube|tiktok|twitter|reddit)\\.com\\/.*", "invalid", 50],
|
|
||||||
[".*cnbc\\.com\\/(video|quotes)\\/.*", "invalid", 75],
|
|
||||||
[".*foxnews\\.com\\/(video|category)\\/.*", "invalid", 75],
|
|
||||||
[".*radio.foxnews\\.com\\/.*", "invalid", 75],
|
|
||||||
[".*breitbart\\.com\\/(tag|author)\\/.*", "invalid", 75],
|
|
||||||
[".*zerohedge\\.com\\/(user)\\/.*", "invalid", 75],
|
|
||||||
[".*zerohedge\\.com\\/(economics|political|markets|)\\/.*", "valid", 50],
|
|
||||||
[".*breitbart\\.com\\/(economy|entertainment|border|crime|clips)\\/.*", "valid", 50],
|
|
||||||
[".*foxnews\\.com\\/(lifestyle|opinion|sports|world)\\/.*", "valid", 50]
|
|
||||||
]
|
|
||||||
}
|
|
||||||
@@ -12,8 +12,8 @@
|
|||||||
"import requests\n",
|
"import requests\n",
|
||||||
"from datetime import datetime, timedelta, timezone\n",
|
"from datetime import datetime, timedelta, timezone\n",
|
||||||
"\n",
|
"\n",
|
||||||
"admin_api_url = \"\"\n",
|
"admin_api_url = \"\" # .env -> GHOST_ADMIN_API_URL\n",
|
||||||
"admin_api_key = \"\"\n",
|
"admin_api_key = \"\" # .env -> GHOST_ADMIN_API_KEY\n",
|
||||||
"\n",
|
"\n",
|
||||||
"def _create_jwt(admin_api_key):\n",
|
"def _create_jwt(admin_api_key):\n",
|
||||||
" id_, secret = admin_api_key.split(':')\n",
|
" id_, secret = admin_api_key.split(':')\n",
|
||||||
|
|||||||
@@ -8,11 +8,206 @@
|
|||||||
"source": [
|
"source": [
|
||||||
"url = \"https://onlinenewspapers.com/index.shtml\""
|
"url = \"https://onlinenewspapers.com/index.shtml\""
|
||||||
]
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"'newspaper/0.9.3.1'"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"\"\"\"\n",
|
||||||
|
"import newspaper\n",
|
||||||
|
"\n",
|
||||||
|
"newspaper.Config().__dict__\n",
|
||||||
|
"\n",
|
||||||
|
" 'requests_params': {'timeout': 7,\n",
|
||||||
|
" 'proxies': {},\n",
|
||||||
|
" 'headers': {'User-Agent': 'newspaper/0.9.3.1'}},\n",
|
||||||
|
"\"\"\"\n",
|
||||||
|
"import newspaper\n",
|
||||||
|
"newspaper.Config().browser_user_agent"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"\"\"\"\n",
|
||||||
|
" url (str): The url of the source (news website) to build. For example,\n",
|
||||||
|
" `https://www.cnn.com`.\n",
|
||||||
|
" dry (bool): If true, the source object will be constructed but not\n",
|
||||||
|
" downloaded or parsed.\n",
|
||||||
|
" only_homepage (bool): If true, the source object will only parse\n",
|
||||||
|
" the homepage of the source.\n",
|
||||||
|
" only_in_path (bool): If true, the source object will only\n",
|
||||||
|
" parse the articles that are in the same path as the source's\n",
|
||||||
|
" homepage. You can scrape a specific category this way.\n",
|
||||||
|
" Defaults to False.\n",
|
||||||
|
" input_html (str): The HTML of the source to parse. Use this to pass cached\n",
|
||||||
|
" HTML to the source object.\n",
|
||||||
|
" config (Configuration): A configuration object to use for the source.\n",
|
||||||
|
" kwargs: Any other keyword arguments to pass to the Source constructor.\n",
|
||||||
|
" If you omit the config object, you can add any configuration\n",
|
||||||
|
" options here.\n",
|
||||||
|
"\"\"\"\n",
|
||||||
|
"\n",
|
||||||
|
"url = \"https://www.lanacion.com.ar/deportes/\"\n",
|
||||||
|
"\n",
|
||||||
|
"newspaper_built = newspaper.build(url, only_in_path=True)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"newspaper_built.__dict__"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"newspaper_built.article_urls()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"url = \"https://www.lanacion.com.ar/\"\n",
|
||||||
|
"#url = \"https://www.lanacion.com.ar/deportes/\"\n",
|
||||||
|
"newspaper_built = newspaper.build(url)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"\"\"\"\n",
|
||||||
|
" url (str): The url of the source (news website) to build. For example,\n",
|
||||||
|
" `https://www.cnn.com`.\n",
|
||||||
|
" dry (bool): If true, the source object will be constructed but not\n",
|
||||||
|
" downloaded or parsed.\n",
|
||||||
|
" only_homepage (bool): If true, the source object will only parse\n",
|
||||||
|
" the homepage of the source.\n",
|
||||||
|
" only_in_path (bool): If true, the source object will only\n",
|
||||||
|
" parse the articles that are in the same path as the source's\n",
|
||||||
|
" homepage. You can scrape a specific category this way.\n",
|
||||||
|
" Defaults to False.\n",
|
||||||
|
" input_html (str): The HTML of the source to parse. Use this to pass cached\n",
|
||||||
|
" HTML to the source object.\n",
|
||||||
|
" config (Configuration): A configuration object to use for the source.\n",
|
||||||
|
" kwargs: Any other keyword arguments to pass to the Source constructor.\n",
|
||||||
|
" If you omit the config object, you can add any configuration\n",
|
||||||
|
" options here.\n",
|
||||||
|
"\"\"\""
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"cat = newspaper_built.categories[0]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"newspaper_built.categories_to_articles()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"newspaper_built.category_urls()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
" 'https://www.lanacion.com.ar/tema/futbol-argentino-tid57505/',\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"categories = newspaper_built.category_urls()\n",
|
||||||
|
"url_of_interest = \"https://www.lanacion.com.ar/sabado/todo-para-ellos-nid21042025/\"\n",
|
||||||
|
"\n",
|
||||||
|
"potential_categories = []\n",
|
||||||
|
"\n",
|
||||||
|
"for c in categories:\n",
|
||||||
|
" if (c in url_of_interest):\n",
|
||||||
|
" print(c, url_of_interest)\n",
|
||||||
|
" potential_categories.append(c)\n",
|
||||||
|
"\n",
|
||||||
|
"# Get longest length category"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"newspaper_built.article_urls()"
|
||||||
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"metadata": {
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "matitos_urls",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
"language_info": {
|
"language_info": {
|
||||||
"name": "python"
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.12.9"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user