{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "url = \"https://onlinenewspapers.com/index.shtml\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'newspaper/0.9.3.1'"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\"\"\"\n",
    "import newspaper\n",
    "\n",
    "newspaper.Config().__dict__\n",
    "\n",
    " 'requests_params': {'timeout': 7,\n",
    " 'proxies': {},\n",
    " 'headers': {'User-Agent': 'newspaper/0.9.3.1'}},\n",
    "\"\"\"\n",
    "import newspaper\n",
    "newspaper.Config().browser_user_agent"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\"\"\"\n",
    " url (str): The url of the source (news website) to build. For example,\n",
    " `https://www.cnn.com`.\n",
    " dry (bool): If true, the source object will be constructed but not\n",
    " downloaded or parsed.\n",
    " only_homepage (bool): If true, the source object will only parse\n",
    " the homepage of the source.\n",
    " only_in_path (bool): If true, the source object will only\n",
    " parse the articles that are in the same path as the source's\n",
    " homepage. You can scrape a specific category this way.\n",
    " Defaults to False.\n",
    " input_html (str): The HTML of the source to parse. Use this to pass cached\n",
    " HTML to the source object.\n",
    " config (Configuration): A configuration object to use for the source.\n",
    " kwargs: Any other keyword arguments to pass to the Source constructor.\n",
    " If you omit the config object, you can add any configuration\n",
    " options here.\n",
    "\"\"\"\n",
    "\n",
    "# Build a source for the /deportes/ section only (only_in_path=True, see notes above).\n",
    "url = \"https://www.lanacion.com.ar/deportes/\"\n",
    "\n",
    "newspaper_built = newspaper.build(url, only_in_path=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "newspaper_built.__dict__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "newspaper_built.article_urls()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "\n",
    "# Rebuild from the site root, this time without the only_in_path restriction.\n",
    "url = \"https://www.lanacion.com.ar/\"\n",
    "#url = \"https://www.lanacion.com.ar/deportes/\"\n",
    "newspaper_built = newspaper.build(url)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\"\"\"\n",
    " url (str): The url of the source (news website) to build. For example,\n",
    " `https://www.cnn.com`.\n",
    " dry (bool): If true, the source object will be constructed but not\n",
    " downloaded or parsed.\n",
    " only_homepage (bool): If true, the source object will only parse\n",
    " the homepage of the source.\n",
    " only_in_path (bool): If true, the source object will only\n",
    " parse the articles that are in the same path as the source's\n",
    " homepage. You can scrape a specific category this way.\n",
    " Defaults to False.\n",
    " input_html (str): The HTML of the source to parse. Use this to pass cached\n",
    " HTML to the source object.\n",
    " config (Configuration): A configuration object to use for the source.\n",
    " kwargs: Any other keyword arguments to pass to the Source constructor.\n",
    " If you omit the config object, you can add any configuration\n",
    " options here.\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cat = newspaper_built.categories[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "newspaper_built.categories_to_articles()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "newspaper_built.category_urls()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sample category URL kept for reference. Dedented so it is valid Python as\n",
    "# written; the trailing comma makes the expression a 1-tuple.\n",
    "'https://www.lanacion.com.ar/tema/futbol-argentino-tid57505/',\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "categories = newspaper_built.category_urls()\n",
    "url_of_interest = \"https://www.lanacion.com.ar/sabado/todo-para-ellos-nid21042025/\"\n",
    "\n",
    "potential_categories = []\n",
    "\n",
    "# Keep every category URL that occurs inside the article URL.\n",
    "for c in categories:\n",
    "    if (c in url_of_interest):\n",
    "        print(c, url_of_interest)\n",
    "        potential_categories.append(c)\n",
    "\n",
    "# Get longest length category\n",
    "# (default=None avoids a ValueError when no category matched).\n",
    "best_category = max(potential_categories, key=len, default=None)\n",
    "best_category"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "newspaper_built.article_urls()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "matitos_urls",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}