Working fetch search, refactoring DB towards source search

This commit is contained in:
Luciano Gervasoni
2025-03-20 11:42:33 +01:00
parent 83f76232b2
commit 05e17266f1
14 changed files with 558 additions and 120 deletions

View File

@@ -2,15 +2,80 @@
"cells": [
{
"cell_type": "code",
"execution_count": null,
"execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
"import newspaper\n",
"url = \"http://www.missingkids.org/poster/NCMC/2045193/1\"\n",
"#url = \"https://www.missingkids.org/new-poster/NCMC/2045193/1\"\n",
"# !pip install git+https://github.com/tasos-py/Search-Engines-Scraper.git\n",
"import search_engines\n",
"\n",
"art = newspaper.article(url)"
"engine = search_engines.Bing()"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Searching Bing \n",
" \r"
]
}
],
"source": [
"results = engine.search('news: \"child abuse\"', pages=2)"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"engine = search_engines.search_engines_dict[\"brave\"]()"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Searching Brave \n",
" \r"
]
}
],
"source": [
"query = 'news: child abuse'\n",
"r = engine.search(query, pages=2)"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'_results': []}"
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"r.__dict__"
]
},
{
@@ -18,8 +83,57 @@
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"newspaper.exceptions.ArticleBinaryDataException"
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"art.__dict__"
"import newspaper\n",
"newspaper.ArticleBinaryDataException"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"'''\n",
"import newspaper\n",
"\n",
"url = 'https://www.missingkids.org/poster/USVA/VA25-0820/1'\n",
"art_1 = newspaper.article(url)\n",
"url = 'https://www.missingkids.org/poster/NCMC/2045193/1'\n",
"art_2 = newspaper.article(url)\n",
"'''"
]
},
{
@@ -44,15 +158,8 @@
"l = client.list()\n",
"list_models = [m.get(\"model\") for m in l.model_dump().get(\"models\")]\n",
"\n",
"list_models"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(list_models)\n",
"\n",
"for m in list_models:\n",
" context_key = [ k for k in client.show(m).model_dump().get(\"modelinfo\").keys() if \"context_length\" in k]\n",
" if (len(context_key) != 1):\n",