This post is a continuation of the third part of the series.
In today's post we will take on web scraping. Thanks to this technique, in addition to accepting plain text, our RAG will be able to fetch information directly from web pages on its own.
Data cleaning
We start by creating a function that downloads the content of a web page and then strips out all styles and scripts. This saves tokens and removes text that is completely unnecessary.
def fetch_url_text(url: str) -> str:
    try:
        r = requests.get(url, timeout=10)
        r.raise_for_status()
    except Exception as e:
        print(f"Nie udało się pobrać {url}: {e}")
        return ""
    soup = BeautifulSoup(r.text, "html.parser")
    for s in soup(["script", "style", "noscript", "iframe"]):
        s.extract()
    text = soup.get_text(separator=" ")
    text = " ".join(text.split())
    return text
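Before wiring the function into the rest of the pipeline, it is worth a quick manual test. Below is a minimal sketch, assuming requests and BeautifulSoup are imported as in the full listing further down; the address is only an example.

# Quick manual check of fetch_url_text (example URL)
preview = fetch_url_text("https://example.com")
print(len(preview), "characters of cleaned text")
print(preview[:200])  # first 200 characters as a preview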
To let it handle more than one address, we add a wrapper that takes a list of URLs and calls the function above for each of them.
def load_data_from_url(urls: List[str]) -> List[Dict]:
    data = []
    for idx, url in enumerate(urls):
        page_body = fetch_url_text(url)
        if page_body:
            data.append(
                {
                    "id": f"doc-{idx}-{url.replace('https://', '').replace('/', '_')}",
                    "doc": page_body,
                    "meta": {"source": url},
                }
            )
    return data
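A side note on the id field: the code above only strips the "https://" prefix, so addresses served over plain "http://" keep their scheme in the identifier. If you prefer scheme-independent ids, a small helper based on urllib.parse could look like the sketch below (not part of the original listing; url_to_id is a hypothetical name).

from urllib.parse import urlparse

def url_to_id(idx: int, url: str) -> str:
    # Build a scheme-independent identifier from the host and path
    parsed = urlparse(url)
    slug = (parsed.netloc + parsed.path).replace("/", "_").strip("_")
    return f"doc-{idx}-{slug}"

# e.g. url_to_id(0, "http://blog.kamdev.pl/post") -> "doc-0-blog.kamdev.pl_post"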
Since we added the load_sample_data() method in the first part, we will wrap it in a single entry-point function. If no list of URLs is provided, it returns the sample data; otherwise, we get the content of the fetched pages.
def load_docs(urls: Optional[List[str]] = None) -> List[Dict]:
    if urls:
        return load_data_from_url(urls)
    return load_sample_data()
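A short usage sketch of the wrapper (the addresses are only examples): without arguments it falls back to the sample documents, while with a list of URLs it scrapes the given pages.

# No arguments -> the predefined sample documents
sample_docs = load_docs()

# With URLs -> content scraped from the given pages (example addresses)
web_docs = load_docs(["https://kamdev.pl", "http://blog.kamdev.pl"])

for d in web_docs:
    print(d["id"], "->", d["meta"]["source"])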
Full code:
import json
from typing import Dict, List, Optional
import requests
from bs4 import BeautifulSoup
from rich.console import Console
from rich.panel import Panel
from rich.table import Table

def fetch_url_text(url: str) -> str:
    """
    Fetches the page content from the provided URL and extracts plain text.

    Args:
        url (str): The URL of the page to fetch.

    Returns:
        str: The plain text content of the page, or an empty string if the content could not be retrieved.
    """
    try:
        r = requests.get(url, timeout=10)
        r.raise_for_status()
    except Exception as e:
        print(f"Nie udało się pobrać {url}: {e}")
        return ""
    soup = BeautifulSoup(r.text, "html.parser")
    for s in soup(["script", "style", "noscript", "iframe"]):
        s.extract()
    text = soup.get_text(separator=" ")
    text = " ".join(text.split())
    return text

def load_data_from_url(urls: List[str]) -> List[Dict]:
    """
    Fetches the content of pages from the provided list of URLs and returns a list of dictionaries in the following format:
    [{"id": ..., "doc": ..., "meta": {"source": ...}}, ...]

    Args:
        urls (List[str]): A list of URLs to fetch content from.

    Returns:
        List[Dict]: A list of dictionaries where each dictionary contains the following keys:
            - "id": A unique identifier for the document (e.g., a combination of URL and index).
            - "doc": The plain text content fetched from the URL.
            - "meta": A dictionary with metadata, including the "source" key that stores the URL.

    Example:
        [
            {"id": "doc-0-example_com", "doc": "Page content", "meta": {"source": "http://example.com"}},
            {"id": "doc-1-example_org", "doc": "Another page content", "meta": {"source": "http://example.org"}}
        ]
    """
    data = []
    for idx, url in enumerate(urls):
        page_body = fetch_url_text(url)
        if page_body:
            data.append(
                {
                    "id": f"doc-{idx}-{url.replace('https://', '').replace('/', '_')}",
                    "doc": page_body,
                    "meta": {"source": url},
                }
            )
    return data

def load_sample_data() -> List[Dict]:
    """
    Returns sample data in JSON format.

    This method provides a predefined set of sample documents, each containing an identifier (`id`),
    text content (`doc`), and metadata (`meta`) with the source of the document.

    Returns:
        List[Dict]: A list of dictionaries, each representing a sample document with the following keys:
            - "id": A unique identifier for the document.
            - "doc": The content of the document as a string.
            - "meta": A dictionary with metadata about the document, including the "source" key.

    Example:
        [
            {"id": "doc1", "doc": "Chroma is an engine for vector databases.", "meta": {"source": "notes"}},
            {"id": "doc2", "doc": "OpenAI embeddings allow text to be converted into vectors.", "meta": {"source": "blog"}}
        ]
    """
    json_data = """
    [
        {"id":"doc1", "doc":"Chroma is an engine for vector databases.", "meta":{"source":"notes"}},
        {"id":"doc2", "doc":"OpenAI embeddings allow text to be converted into vectors.", "meta":{"source":"blog"}},
        {"id":"doc3", "doc":"LangChain makes it easier to create applications based on LLMs.", "meta":{"source":"documentation"}},
        {"id":"doc4", "doc":"Vector databases enable efficient searching through large text data sets.", "meta":{"source":"article"}},
        {"id":"doc5", "doc":"RAG combines text generation with real-time information retrieval.", "meta":{"source":"presentation"}}
    ]
    """
    return json.loads(json_data)

def load_docs(urls: Optional[List[str]] = None) -> List[Dict]:
    """
    Loads documents. If URLs are provided, fetches their content; otherwise, returns sample data.

    If a list of URLs is provided, this method will attempt to fetch the content from each URL and return it in the
    form of a list of dictionaries. If no URLs are provided, it will return a predefined set of sample data.

    Args:
        urls (Optional[List[str]], optional): A list of URLs to fetch content from. Defaults to None.

    Returns:
        List[Dict]: A list of dictionaries, where each dictionary contains the following:
            - "id": A unique identifier for the document.
            - "doc": The content of the document (text from the URL or sample data).
            - "meta": A dictionary with metadata about the document, including the "source" key with the URL or source of sample data.

    Example:
        If URLs are provided:
            [
                {"id": "doc-0-example_com", "doc": "Page content", "meta": {"source": "http://example.com"}},
                {"id": "doc-1-example_org", "doc": "Another page content", "meta": {"source": "http://example.org"}}
            ]

        If no URLs are provided, returns sample data:
            [
                {"id": "doc1", "doc": "Chroma is an engine for vector databases.", "meta": {"source": "notes"}},
                {"id": "doc2", "doc": "OpenAI embeddings allow text to be converted into vectors.", "meta": {"source": "blog"}}
            ]
    """
    if urls:
        return load_data_from_url(urls)
    return load_sample_data()

docs = ["https://kamdev.pl", "http://blog.kamdev.pl"]
data = load_docs(docs)

# Display the fetched data
console = Console()
table = Table(title="Zaimportowane strony", show_lines=True)
table.add_column("ID")
table.add_column("Body")
table.add_column("Źródło")

for doc in data:
    table.add_row(doc["id"], doc["doc"][:80], doc["meta"].get("source", "brak źródła"))

console.print(table)
After running the program above, we get:
*The Body column has been intentionally shortened for readability.