Task

  • A local LLM as an AI, with no connection to the internet
  • Ingesting personal documents from the NAS (RAG-style ingestion rather than true fine-tuning)
  • Using this information in the chatbot

To use the Nvidia GPU, only a Windows machine is currently an option here. Thanks to WSL, however, Ubuntu can run on top of it, so a preferred operating system can still be used.

There is a complete how-to at https://dev.to/docteurrs/installing-privategpt-on-wsl-with-gpu-support-1m2a that explains all the steps.

The actual source is: git clone https://github.com/zylon-ai/private-gpt

The only adjustments that were necessary (for me):

sudo apt install nvidia-cuda-toolkit

CMAKE_ARGS='-DLLAMA_CUBLAS=OFF -DGGML_CUDA=ON' poetry run pip install --force-reinstall --no-cache-dir llama-cpp-python

poetry install --extras ui --extras llms-llama-cpp --extras vector-stores-qdrant --extras embeddings-huggingface
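
Whether the rebuilt llama-cpp-python actually offloads to the GPU can be checked with a quick smoke test. This is only a sketch: the GGUF model path below is a placeholder for any local model file.

# Sketch: smoke test for the CUDA build of llama-cpp-python.
from llama_cpp import Llama

# ./models/model.gguf is a placeholder - point it at any local GGUF model
llm = Llama(model_path="./models/model.gguf", n_gpu_layers=-1, verbose=True)
# A CUDA build logs offloaded layers at startup; a CPU-only build offloads none
print(llm("Hello", max_tokens=8)["choices"][0]["text"])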


Afterwards, a source directory can be "watched" with:

make ingest /[source directory]/ -- --watch --log-file ./ingest.log

so that documents placed in it are ingested automatically.

Start

make run

then open http://127.0.0.1:8001 in the browser.
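
Besides the web UI at that address, the same server exposes an HTTP API, which is handy for scripting. A small sketch using requests; the routes /v1/ingest/file and /v1/completions follow the PrivateGPT API reference, but treat them as assumptions and check the docs of your installed version:

# Sketch: talking to the running PrivateGPT server over HTTP instead of the UI.
import requests

BASE = "http://127.0.0.1:8001"

# Ingest a single document (a one-off alternative to `make ingest`)
with open("example.pdf", "rb") as f:
    r = requests.post(f"{BASE}/v1/ingest/file", files={"file": ("example.pdf", f)})
r.raise_for_status()

# Ask a question; use_context=True lets the answer draw on ingested documents
r = requests.post(
    f"{BASE}/v1/completions",
    json={"prompt": "Summarize my ingested documents.", "use_context": True},
)
r.raise_for_status()
print(r.json())  # inspect the returned completion object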

Office Support

Since the original does not support all Office file types, I made a few adjustments.

  • poetry add python-pptx
  • poetry add openpyxl


/private-gpt/private_gpt/components/ingest/ingest_helper.py

import logging
from pathlib import Path
from typing import List, Union

from openpyxl import load_workbook
from openpyxl.utils.exceptions import InvalidFileException

from llama_index.core.readers import StringIterableReader
from llama_index.core.readers.base import BaseReader
from llama_index.core.readers.json import JSONReader
from llama_index.core.schema import Document

logger = logging.getLogger(__name__)


# Inspired by the `llama_index.core.readers.file.base` module
def _try_loading_included_file_formats() -> dict[str, type[BaseReader]]:
    try:
        from llama_index.readers.file.docs import DocxReader, PDFReader  # type: ignore
        from llama_index.readers.file.slides import PptxReader  # type: ignore
        from llama_index.readers.file.tabular import PandasCSVReader  # type: ignore
    except ImportError as e:
        raise ImportError("`llama-index-readers-file` package not found") from e

    return {
        ".pdf": PDFReader,
        ".docx": DocxReader,
        ".pptx": PptxReader,
        ".csv": PandasCSVReader,
    }


# Patching the default file reader to support other file types
FILE_READER_CLS = _try_loading_included_file_formats()
FILE_READER_CLS.update(
    {
        ".json": JSONReader,
    }
)


class IngestionHelper:
    """Helper class to transform a file or text into a list of documents."""

    @staticmethod
    def transform_file_into_documents(
        file_name: str, file_data: Union[str, Path]
    ) -> List[Document]:
        # Handle raw text
        if isinstance(file_data, str) and "\n" in file_data:
            logger.info("Detected raw text input; processing as text content.")
            return IngestionHelper._handle_raw_text(file_data)

        # Handle file paths
        if isinstance(file_data, (str, Path)):
            try:
                file_data_path = Path(file_data).resolve(strict=True)
                return IngestionHelper._load_file_to_documents(file_name, file_data_path)
            except FileNotFoundError:
                logger.error(f"File not found: {file_data}")
            except OSError as e:
                logger.error(f"Error accessing file: {file_data} - {e}")

        return []

    @staticmethod
    def _handle_raw_text(raw_text: str) -> List[Document]:
        """Handle cases where the input is raw text instead of a file."""
        string_reader = StringIterableReader()
        try:
            return string_reader.load_data([raw_text])
        except Exception as e:
            logger.error(f"Error processing raw text: {e}")
            return []

    @staticmethod
    def _load_file_to_documents(file_name: str, file_data: Path) -> List[Document]:
        logger.debug("Transforming file_name=%s into documents", file_name)

        if not file_data.is_file():
            logger.error(f"Path is not a file: {file_data}")
            return []

        extension = file_data.suffix.lower()
        reader_cls = FILE_READER_CLS.get(extension)
        if reader_cls is None:
            logger.debug(
                "No reader found for extension=%s, using default string reader",
                extension,
            )
            try:
                string_reader = StringIterableReader()
                return string_reader.load_data([file_data.read_text(encoding="utf-8")])
            except Exception as e:
                logger.error(f"Failed to read text file {file_data}: {e}")
                return []

        logger.debug("Specific reader found for extension=%s", extension)
        try:
            documents = reader_cls().load_data(file_data)
        except Exception as e:
            logger.error(f"Error reading file {file_data} with extension {extension}: {e}")
            documents = []

        # Sanitize NUL bytes in text which can't be stored in databases
        for i in range(len(documents)):
            documents[i].text = documents[i].text.replace("\u0000", "")

        return documents

    @staticmethod
    def _exclude_metadata(documents: List[Document]) -> None:
        logger.debug("Excluding metadata from count=%s documents", len(documents))
        for document in documents:
            document.metadata["doc_id"] = document.doc_id
            # We don't want the Embeddings search to receive this metadata
            document.excluded_embed_metadata_keys = ["doc_id"]
            # We don't want the LLM to receive these metadata in the context
            document.excluded_llm_metadata_keys = ["file_name", "doc_id", "page_label"]
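
The otherwise unused openpyxl imports at the top are what the poetry add openpyxl step is for. As a minimal sketch of how an .xlsx reader could be wired in (the XlsxReader class and its flattening strategy below are my own illustration, not part of the original file, which is truncated here):

# Hypothetical .xlsx reader built on the openpyxl imports above; it flattens
# every sheet to tab-separated text and registers itself in FILE_READER_CLS.
class XlsxReader(BaseReader):
    def load_data(self, file: Path, extra_info: Union[dict, None] = None) -> List[Document]:
        try:
            workbook = load_workbook(filename=file, read_only=True, data_only=True)
        except InvalidFileException as e:
            logger.error(f"Invalid xlsx file {file}: {e}")
            return []
        rows: List[str] = []
        for sheet in workbook.worksheets:
            for row in sheet.iter_rows(values_only=True):
                rows.append("\t".join("" if cell is None else str(cell) for cell in row))
        # One Document per workbook; all sheets flattened into one text block
        return [Document(text="\n".join(rows), metadata=extra_info or {})]

FILE_READER_CLS[".xlsx"] = XlsxReader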