matitos_news/utils/Newspapers.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Candidate starting URL; `url` is reassigned in a later cell before it is first used.\n",
    "url = \"https://onlinenewspapers.com/index.shtml\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'newspaper/0.9.3.1'"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# The bare string literal below is a scratch note, not executed code: it records the\n",
    "# 'requests_params' entry of newspaper.Config().__dict__ (presumably pasted from an\n",
    "# earlier run -- TODO confirm).\n",
    "\"\"\"\n",
    "import newspaper\n",
    "\n",
    "newspaper.Config().__dict__\n",
    "\n",
    " 'requests_params': {'timeout': 7,\n",
    "  'proxies': {},\n",
    "  'headers': {'User-Agent': 'newspaper/0.9.3.1'}},\n",
    "\"\"\"\n",
    "import newspaper\n",
    "# Display the default user agent of a fresh Config; the saved output of this cell is\n",
    "# 'newspaper/0.9.3.1'.\n",
    "newspaper.Config().browser_user_agent"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The bare string literal below is a reference note describing the arguments of a\n",
    "# source builder (presumably copied from the newspaper.build() docstring -- TODO confirm).\n",
    "\"\"\"\n",
    "        url (str): The url of the source (news website) to build. For example,\n",
    "            `https://www.cnn.com`.\n",
    "        dry (bool): If true, the source object will be constructed but not\n",
    "            downloaded or parsed.\n",
    "        only_homepage (bool): If true, the source object will only parse\n",
    "            the homepage of the source.\n",
    "        only_in_path (bool): If true, the source object will only\n",
    "            parse the articles that are in the same path as the source's\n",
    "            homepage. You can scrape a specific category this way.\n",
    "            Defaults to False.\n",
    "        input_html (str): The HTML of the source to parse. Use this to pass cached\n",
    "            HTML to the source object.\n",
    "        config (Configuration): A configuration object to use for the source.\n",
    "        kwargs: Any other keyword arguments to pass to the Source constructor.\n",
    "            If you omit the config object, you can add any configuration\n",
    "            options here.\n",
    "\"\"\"\n",
    "\n",
    "# Build a source for the /deportes/ section only: per the note above, only_in_path=True\n",
    "# restricts parsing to articles under the same path as `url`.\n",
    "url = \"https://www.lanacion.com.ar/deportes/\"\n",
    "\n",
    "newspaper_built = newspaper.build(url, only_in_path=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Dump the instance attributes of the built source for inspection.\n",
    "vars(newspaper_built)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Display the article URLs of the source currently bound to `newspaper_built`\n",
    "# (the /deportes/-scoped build when the notebook is run top to bottom).\n",
    "newspaper_built.article_urls()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "\n",
    "# Rebuild from the site root, this time without only_in_path. NOTE: this rebinds both\n",
    "# `url` and `newspaper_built`, so every cell run after this one sees the new source.\n",
    "url = \"https://www.lanacion.com.ar/\"\n",
    "#url = \"https://www.lanacion.com.ar/deportes/\"\n",
    "newspaper_built = newspaper.build(url)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reference note on the source-builder arguments (presumably from the newspaper.build()\n",
    "# docstring -- TODO confirm). Kept as comments so the cell does not echo a large escaped\n",
    "# string as its output.\n",
    "#\n",
    "#   url (str): The url of the source (news website) to build. For example,\n",
    "#       `https://www.cnn.com`.\n",
    "#   dry (bool): If true, the source object will be constructed but not\n",
    "#       downloaded or parsed.\n",
    "#   only_homepage (bool): If true, the source object will only parse\n",
    "#       the homepage of the source.\n",
    "#   only_in_path (bool): If true, the source object will only\n",
    "#       parse the articles that are in the same path as the source's\n",
    "#       homepage. You can scrape a specific category this way.\n",
    "#       Defaults to False.\n",
    "#   input_html (str): The HTML of the source to parse. Use this to pass cached\n",
    "#       HTML to the source object.\n",
    "#   config (Configuration): A configuration object to use for the source.\n",
    "#   kwargs: Any other keyword arguments to pass to the Source constructor.\n",
    "#       If you omit the config object, you can add any configuration\n",
    "#       options here."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Grab the first entry of the source's categories for inspection; indexing [0] fails if\n",
    "# no categories were found. `cat` is not referenced by any later cell in this notebook.\n",
    "cat = newspaper_built.categories[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Call categories_to_articles() on the source and display whatever it returns\n",
    "# (presumably the articles gathered from each category -- TODO confirm against the library).\n",
    "newspaper_built.categories_to_articles()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Display the category URLs of the source; a later cell iterates this result and tests\n",
    "# each entry as a substring of an article URL.\n",
    "newspaper_built.category_urls()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sample URL kept for reference (presumably pasted from the category_urls() output\n",
    "# above -- TODO confirm). Left as a comment: as code, the indented fragment evaluated to\n",
    "# a stray one-element tuple and is not valid Python outside IPython.\n",
    "# 'https://www.lanacion.com.ar/tema/futbol-argentino-tid57505/',"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Find which category of the built source an article URL belongs to.\n",
    "categories = newspaper_built.category_urls()\n",
    "url_of_interest = \"https://www.lanacion.com.ar/sabado/todo-para-ellos-nid21042025/\"\n",
    "\n",
    "# Keep every category URL that occurs inside the article URL (order preserved).\n",
    "potential_categories = [c for c in categories if c in url_of_interest]\n",
    "for c in potential_categories:\n",
    "    print(c, url_of_interest)\n",
    "\n",
    "# Get longest length category: the longest matching URL is the most specific one.\n",
    "# default=None keeps the cell from raising ValueError when nothing matched.\n",
    "best_category = max(potential_categories, key=len, default=None)\n",
    "best_category"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Display the article URLs of the source currently bound to `newspaper_built`\n",
    "# (the site-root build when the notebook is run top to bottom).\n",
    "newspaper_built.article_urls()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "matitos_urls",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}