Refactoring fetcher, working feeds and raw url writer
This commit is contained in:
@@ -101,95 +101,11 @@
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 54,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"('https://foxnews.com/us/utah-mommy-blogger-ruby-franke-power-public-image-allowed-child-abuse-go-unchecked-expert',\n",
|
||||
" 'foxnews.com')"
|
||||
]
|
||||
},
|
||||
"execution_count": 54,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# !pip install trafilatura trafilatura[all] cchardet\n",
|
||||
"import courlan\n",
|
||||
"url = \"https://www.foxnews.com/us/utah-mommy-blogger-ruby-franke-power-public-image-allowed-child-abuse-go-unchecked-expert\"\n",
|
||||
"url = \"https://foxnews.com/us/utah-mommy-blogger-ruby-franke-power-public-image-allowed-child-abuse-go-unchecked-expert\"\n",
|
||||
"courlan.check_url(url)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 48,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import newspaper\n",
|
||||
"\n",
|
||||
"article = newspaper.article(url)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 49,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"datetime.datetime(2025, 3, 4, 4, 0, 31, tzinfo=tzoffset(None, -18000))"
|
||||
]
|
||||
},
|
||||
"execution_count": 49,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"article.publish_date"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# !pip install trafilatura\n",
|
||||
"import trafilatura\n",
|
||||
@@ -197,9 +113,18 @@
|
||||
"\n",
|
||||
"url = \"https://www.foxnews.com/us/utah-mommy-blogger-ruby-franke-power-public-image-allowed-child-abuse-go-unchecked-expert\"\n",
|
||||
"# url = \"https://www.missingkids.org/poster/USVA/VA25-0820/1\"\n",
|
||||
"url = \"https://www.bloomberg.com/news/articles/2025-03-12/eu-launches-metals-tariff-retaliation-on-26-billion-of-us-goods\"\n",
|
||||
"\n",
|
||||
"# Fetch\n",
|
||||
"doc = trafilatura.fetch_url(url)\n",
|
||||
"doc = trafilatura.fetch_url(url)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Content & metadata\n",
|
||||
"metadata = trafilatura.extract_metadata(doc)\n",
|
||||
"content = trafilatura.extract(doc)"
|
||||
@@ -207,40 +132,9 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{'author': 'Audrey Conklin',\n",
|
||||
" 'body': <Element body at 0x7e22813ce400>,\n",
|
||||
" 'categories': [],\n",
|
||||
" 'comments': None,\n",
|
||||
" 'commentsbody': <Element body at 0x7e22813ce180>,\n",
|
||||
" 'date': '2025-03-03',\n",
|
||||
" 'description': \"Disgraced parenting blogger and mom of six Ruby Franke's \"\n",
|
||||
" '\"power\" and public image\" allowed her crimes against her '\n",
|
||||
" 'children to go \"unchecked,\" according to a defense attorney.',\n",
|
||||
" 'filedate': '2025-03-08',\n",
|
||||
" 'fingerprint': None,\n",
|
||||
" 'hostname': 'foxnews.com',\n",
|
||||
" 'id': None,\n",
|
||||
" 'image': 'https://static.foxnews.com/foxnews.com/content/uploads/2024/03/967e1c1b-Franke.jpg',\n",
|
||||
" 'language': None,\n",
|
||||
" 'license': None,\n",
|
||||
" 'pagetype': 'article',\n",
|
||||
" 'raw_text': None,\n",
|
||||
" 'sitename': 'Fox News',\n",
|
||||
" 'tags': [],\n",
|
||||
" 'text': None,\n",
|
||||
" 'title': \"Utah mommy blogger Ruby Franke's power, public image allowed child \"\n",
|
||||
" \"abuse to go 'unchecked': expert\",\n",
|
||||
" 'url': 'https://www.foxnews.com/us/utah-mommy-blogger-ruby-franke-power-public-image-allowed-child-abuse-go-unchecked-expert'}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"pprint(metadata.as_dict())"
|
||||
]
|
||||
@@ -263,85 +157,50 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"CPU times: user 18.6 ms, sys: 40 μs, total: 18.7 ms\n",
|
||||
"Wall time: 18 ms\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'en'"
|
||||
]
|
||||
},
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"'''\n",
|
||||
"!pip install lingua-language-detector\n",
|
||||
"import lingua\n",
|
||||
"ld = lingua.LanguageDetectorBuilder.from_all_languages().build()\n",
|
||||
"l = ld.detect_language_of(content)\n",
|
||||
"'''\n",
|
||||
"# !pip install newspaper4k\n",
|
||||
"# !pip install langdetect \n",
|
||||
"import newspaper\n",
|
||||
"import langdetect\n",
|
||||
"langdetect.DetectorFactory.seed = 0\n",
|
||||
"langdetect.detect(content)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# !pip install newspaper4k"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import newspaper\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# url = \"https://www.missingkids.org/poster/USVA/VA25-0820/1\"\n",
|
||||
"#url = \"https://www.waff.com/2025/03/11/colbert-heights-high-school-employee-arrested-child-abuse/\"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"#url = \"https://www.bloomberg.com/news/articles/2025-03-12/eu-launches-metals-tariff-retaliation-on-26-billion-of-us-goods\"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"url = \"https://apnews.com/article/canada-trump-us-tariffs-steel-2517a6a2baf0596cb1a43d3a7d1e7939\"\n",
|
||||
"url = \"https://www.foxnews.com/us/utah-mommy-blogger-ruby-franke-power-public-image-allowed-child-abuse-go-unchecked-expert\"\n",
|
||||
"url = \"https://www.missingkids.org/poster/USVA/VA25-0820/1\"\n",
|
||||
"#url = \"https://www.ft.com/content/6d7c6915-4ceb-43fc-9896-590036b12a87\"\n",
|
||||
"#url = \"https://www.lanacion.com.ar/politica/milei-en-bahia-blanca-un-viaje-sorpresa-para-frenar-las-criticas-y-mostrar-cercania-nid12032025/\"\n",
|
||||
"#url = \"https://www.missingkids.org/poster/NCMC/2043547/1\"\n",
|
||||
"\n",
|
||||
"article = newspaper.article(url)\n",
|
||||
"try:\n",
|
||||
" article = newspaper.article(url)\n",
|
||||
"except newspaper.ArticleException as e:\n",
|
||||
" print(\"ArticleException: {}\".format(str(e)))\n",
|
||||
"except Exception as e:\n",
|
||||
" print(\"Err: {}\".format(str(e)))\n",
|
||||
"\n",
|
||||
"url_photo = set([i for i in article.images if \"api.missingkids.org/photographs\" in i])"
|
||||
"# url_photo = set([i for i in article.images if \"api.missingkids.org/photographs\" in i])\n",
|
||||
"# article.is_valid_url(), article.is_parsed, article.is_media_news(), article.is_valid_body()\n",
|
||||
"article.meta_data\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -351,6 +210,13 @@
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
|
||||
Reference in New Issue
Block a user