216 lines
5.7 KiB
Plaintext
216 lines
5.7 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"url = \"https://onlinenewspapers.com/index.shtml\""
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"'newspaper/0.9.3.1'"
|
|
]
|
|
},
|
|
"execution_count": 2,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"\"\"\"\n",
|
|
"import newspaper\n",
|
|
"\n",
|
|
"newspaper.Config().__dict__\n",
|
|
"\n",
|
|
" 'requests_params': {'timeout': 7,\n",
|
|
" 'proxies': {},\n",
|
|
" 'headers': {'User-Agent': 'newspaper/0.9.3.1'}},\n",
|
|
"\"\"\"\n",
|
|
"import newspaper\n",
|
|
"newspaper.Config().browser_user_agent"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"\"\"\"\n",
|
|
" url (str): The url of the source (news website) to build. For example,\n",
|
|
" `https://www.cnn.com`.\n",
|
|
" dry (bool): If true, the source object will be constructed but not\n",
|
|
" downloaded or parsed.\n",
|
|
" only_homepage (bool): If true, the source object will only parse\n",
|
|
" the homepage of the source.\n",
|
|
" only_in_path (bool): If true, the source object will only\n",
|
|
" parse the articles that are in the same path as the source's\n",
|
|
" homepage. You can scrape a specific category this way.\n",
|
|
" Defaults to False.\n",
|
|
" input_html (str): The HTML of the source to parse. Use this to pass cached\n",
|
|
" HTML to the source object.\n",
|
|
" config (Configuration): A configuration object to use for the source.\n",
|
|
" kwargs: Any other keyword arguments to pass to the Source constructor.\n",
|
|
" If you omit the config object, you can add any configuration\n",
|
|
" options here.\n",
|
|
"\"\"\"\n",
|
|
"\n",
|
|
"url = \"https://www.lanacion.com.ar/deportes/\"\n",
|
|
"\n",
|
|
"newspaper_built = newspaper.build(url, only_in_path=True)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"newspaper_built.__dict__"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"newspaper_built.article_urls()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"\n",
|
|
"\n",
|
|
"url = \"https://www.lanacion.com.ar/\"\n",
|
|
"#url = \"https://www.lanacion.com.ar/deportes/\"\n",
|
|
"newspaper_built = newspaper.build(url)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"\"\"\"\n",
|
|
" url (str): The url of the source (news website) to build. For example,\n",
|
|
" `https://www.cnn.com`.\n",
|
|
" dry (bool): If true, the source object will be constructed but not\n",
|
|
" downloaded or parsed.\n",
|
|
" only_homepage (bool): If true, the source object will only parse\n",
|
|
" the homepage of the source.\n",
|
|
" only_in_path (bool): If true, the source object will only\n",
|
|
" parse the articles that are in the same path as the source's\n",
|
|
" homepage. You can scrape a specific category this way.\n",
|
|
" Defaults to False.\n",
|
|
" input_html (str): The HTML of the source to parse. Use this to pass cached\n",
|
|
" HTML to the source object.\n",
|
|
" config (Configuration): A configuration object to use for the source.\n",
|
|
" kwargs: Any other keyword arguments to pass to the Source constructor.\n",
|
|
" If you omit the config object, you can add any configuration\n",
|
|
" options here.\n",
|
|
"\"\"\""
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"cat = newspaper_built.categories[0]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"newspaper_built.categories_to_articles()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"newspaper_built.category_urls()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Sample URL (looks like a pasted list entry; kept as a comment so the cell runs): 'https://www.lanacion.com.ar/tema/futbol-argentino-tid57505/'\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"categories = newspaper_built.category_urls()\n",
|
|
"url_of_interest = \"https://www.lanacion.com.ar/sabado/todo-para-ellos-nid21042025/\"\n",
|
|
"\n",
|
|
"potential_categories = []\n",
|
|
"\n",
|
|
"for c in categories:\n",
|
|
"    if (c in url_of_interest):\n",
|
|
"        print(c, url_of_interest)\n",
|
|
"        potential_categories.append(c)\n",
|
|
"\n",
|
|
"# TODO: select the longest (most specific) entry in potential_categories"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"newspaper_built.article_urls()"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "matitos_urls",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.12.9"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|