Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2,560 changes: 1,783 additions & 777 deletions poetry.lock

Large diffs are not rendered by default.

11 changes: 0 additions & 11 deletions protollm/rags/configs/chroma.env
Original file line number Diff line number Diff line change
@@ -1,11 +0,0 @@
# Chroma DB Settings
CHROMA_HOST='any'
CHROMA_PORT=any
ALLOW_RESET=False

# Documents collection's settings
COLLECTION_NAME='any'
COLLECTION_NAMES_FOR_ADVANCE=['any']
EMBEDDING_NAME='any'
EMBEDDING_HOST='any'
DISTANCE_FN='cosine'
36 changes: 17 additions & 19 deletions protollm/rags/configs/docs_processing_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,23 +10,21 @@ loader:


splitter:
splitter_name: 'hierarchical_merger'
splitter_params:
chunk_size: 510
chunk_overlap: 0
separators:
- '\n\n'
- '\n'
- '. '
- ', '
- '.'
- ','
- ' '
- ''
keep_separator: False
add_start_index: False
strip_whitespace: True
apply_chunks_merge: True
- splitter_name: 'hierarchical_merger'
splitter_params:
chunk_size: 510
chunk_overlap: 0
separators:
- '\n\n'
- '\n'
- '. '
- ', '
- '.'
- ','
- ' '
- ''
keep_separator: False
add_start_index: False
strip_whitespace: True
apply_chunks_merge: True

#tokenizer: 'hf-internal-testing/llama-tokenizer'
tokenizer: 'any'
5 changes: 0 additions & 5 deletions protollm/rags/configs/elastic.env
Original file line number Diff line number Diff line change
@@ -1,8 +1,3 @@
ES_HOST=any
ES_PORT=any
ES_USER=any
ES_PASSWORD=any

ES_INDEX_MAPPINGS: dict = json.loads(Path(CONFIG_PATH, 'index_mappings.json').read_text(encoding="utf-8"))
ES_INDEX_SETTINGS: dict = json.loads(Path(CONFIG_PATH, 'index_settings.json').read_text(encoding="utf-8"))
es_query_template: dict = json.loads(Path(CONFIG_PATH, 'query_template.json').read_text(encoding="utf-8"))
Expand Down
15 changes: 5 additions & 10 deletions protollm/rags/settings/chroma_settings.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
from os.path import dirname
from pathlib import Path
from pydantic_settings import BaseSettings, SettingsConfigDict
from typing import Optional
from pydantic_settings import BaseSettings


class ChromaSettings(BaseSettings):
# Chroma DB settings
chroma_host: str = 'any'
chroma_host: str = '127.0.0.1'
chroma_port: int = 8888
allow_reset: bool = False

Expand All @@ -17,14 +18,8 @@ class ChromaSettings(BaseSettings):
distance_fn: str = 'cosine'

# Documents' processing settings
docs_processing_config: str = str(Path(dirname(dirname(__file__)), '/config_files/', 'docs_processing_config.yaml'))
docs_collection_path: str = str(Path(dirname(dirname(dirname(__file__))), '/docs/', 'example.docx'))

model_config = SettingsConfigDict(
env_file=Path(dirname(dirname(__file__)), '/config_files/', 'chroma.env'),
env_file_encoding='utf-8',
extra='ignore',
)
docs_processing_config: Optional[str] = None
docs_collection_path: str = str(Path(dirname(dirname(__file__))) / 'docs' / 'example.docx')
Copy link
Contributor

@nicl-nno nicl-nno Feb 28, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

А dirname тут критичны? Нельзя ли написать путь от корня проекта (тут и ниже)? Тяжело читается. Да и нужны ли вообще значения по умолчанию с example.docx?



settings = ChromaSettings()
6 changes: 2 additions & 4 deletions protollm/rags/settings/es_settings.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
import json
from os.path import dirname
from pathlib import Path

from pydantic import computed_field
from pydantic_settings import BaseSettings, SettingsConfigDict


CONFIG_PATH = Path(dirname(dirname(__file__)), '/stores/elasticsearch/configs')
CONFIG_PATH = Path(__file__).parent.parent / 'stores/elasticsearch/configs'


class ElasticsearchSettings(BaseSettings):
Expand All @@ -30,7 +28,7 @@ def es_url(self) -> str:
content_field: str = 'paragraph'

model_config = SettingsConfigDict(
env_file=Path(Path(__file__).parent.parent, '/configs/elastic.env'),
env_file=Path(__file__).parent.parent / 'configs' / 'elastic.env',
env_file_encoding='utf-8',
extra='ignore',
)
Expand Down
6 changes: 4 additions & 2 deletions protollm/rags/settings/pipeline_settings.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import warnings
from copy import deepcopy
from typing import TextIO, Optional, Any, Type
from typing import Optional, Any, Type

import inspect

Expand All @@ -18,7 +18,7 @@ def _get_params_for_transformer(params: dict[str, Any],
transformer_class: Type[BaseDocumentTransformer]) -> dict[str, Any]:
text_splitter_params = inspect.signature(TextSplitter.__init__).parameters.keys()
transformer_params = {key: value for key, value in params.items()
if key in inspect.signature(transformer_class.__init__).parameters.keys()
if key in inspect.signature(transformer_class.__init__).parameters
or key in text_splitter_params}
return transformer_params

Expand Down Expand Up @@ -54,6 +54,8 @@ def loader_params(self) -> dict[str, Any]:

@loader_params.setter
def loader_params(self, config: Optional[ConfigFile]):
if config is None:
return
loader_params = deepcopy(config.loader.parsing_params)
loader_params['file_path'] = config.loader.file_path
self._loader_params = loader_params
Expand Down
13 changes: 11 additions & 2 deletions protollm/rags/stores/chroma/chroma_loader.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from pathlib import Path
from typing import Optional
import logging

Expand All @@ -8,12 +9,14 @@
from protollm.rags.pipeline.etl_pipeline import DocsExtractPipeline
from protollm.rags.settings.pipeline_settings import PipelineSettings
from protollm.rags.settings.chroma_settings import ChromaSettings, settings as default_settings
from langchain_core.embeddings import Embeddings

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


def load_documents_to_chroma_db(settings: Optional[ChromaSettings] = None,
embedding_function: Optional[Embeddings] = None,
processing_batch_size: int = 100,
loading_batch_size: int = 32,
**kwargs) -> None:
Expand All @@ -26,10 +29,16 @@ def load_documents_to_chroma_db(settings: Optional[ChromaSettings] = None,
f' loading_batch_size: {loading_batch_size}'
)

pipeline_settings = PipelineSettings.config_from_file(settings.docs_processing_config)
if settings.docs_processing_config is None:
pipeline_settings = PipelineSettings.config_from_file(str(Path(__file__).parent.parent.parent / 'configs' / 'docs_processing_config.yaml'))
else:
pipeline_settings = PipelineSettings.config_from_file(settings.docs_processing_config)

if embedding_function is None:
embedding_function = HuggingFaceHubEmbeddings(model=settings.embedding_host)

store = Chroma(collection_name=settings.collection_name,
embedding_function=HuggingFaceHubEmbeddings(model=settings.embedding_host, huggingfacehub_api_token='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'),
embedding_function=embedding_function,
client=chromadb.HttpClient(host=settings.chroma_host, port=settings.chroma_port))

# Documents loading and processing
Expand Down
2 changes: 1 addition & 1 deletion protollm/rags/stores/elasticsearch/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from pathlib import Path


CONFIG_PATH = Path(Path(__file__).parent, 'configs')
CONFIG_PATH = Path(__file__).parent / 'configs'


class ElasticsearchSettings:
Expand Down
8 changes: 4 additions & 4 deletions protollm/raw_data_processing/docs_parsers/loaders/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from doc_loader import WordDocumentLoader
from directory_loader import RecursiveDirectoryLoader
from pdf_loader import PDFLoader
from zip_loader import ZipLoader
from protollm.raw_data_processing.docs_parsers.loaders.doc_loader import WordDocumentLoader
from protollm.raw_data_processing.docs_parsers.loaders.directory_loader import RecursiveDirectoryLoader
from protollm.raw_data_processing.docs_parsers.loaders.pdf_loader import PDFLoader
from protollm.raw_data_processing.docs_parsers.loaders.zip_loader import ZipLoader
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
from pathlib import Path
from typing import Iterator, Union, Any, Optional, Sequence
from typing import Any, Iterator, Optional, Sequence, Union

from langchain_community.document_loaders.directory import _is_visible
from langchain_core.document_loaders import BaseLoader
from langchain_core.documents import Document
from tqdm import tqdm

from protollm.raw_data_processing.docs_parsers.parsers import ParsingScheme, DocType, BaseParser
from protollm.raw_data_processing.docs_parsers.utils.logger import ParsingLogger
from protollm.raw_data_processing.docs_parsers.loaders.pdf_loader import PDFLoader
from protollm.raw_data_processing.docs_parsers.utils.utilities import correct_path_encoding
from protollm.raw_data_processing.docs_parsers.loaders.doc_loader import WordDocumentLoader
from protollm.raw_data_processing.docs_parsers.loaders.pdf_loader import PDFLoader
from protollm.raw_data_processing.docs_parsers.loaders.zip_loader import ZipLoader
from protollm.raw_data_processing.docs_parsers.parsers import BaseParser, DocType, ParsingScheme
from protollm.raw_data_processing.docs_parsers.utils.logger import ParsingLogger
from protollm.raw_data_processing.docs_parsers.utils.utilities import correct_path_encoding


class RecursiveDirectoryLoader(BaseLoader):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,13 @@


def _get_omml2mml_transformation() -> etree.XSLT:
omml2mml_file = Path(Path(__file__).parent, "xsl", "omml2mml", "OMML2MML.XSL")
omml2mml_file = Path(__file__).parent / "xsl/omml2mml/OMML2MML.XSL"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

А чем такой вариант лучше, чем через Path?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

На Linux почему-то такой вариант не работал и выдавал лишь относительный путь.

omml2mml = etree.XSLT(etree.parse(omml2mml_file))
return omml2mml


def _get_mml2tex_transformation() -> etree.XSLT:
mml2tex_file = Path(Path(__file__).parent, "xsl", "mml2tex", "mmltex.xsl")
mml2tex_file = Path(__file__).parent / "xsl/mml2tex/mmltex.xsl"
mml2tex = etree.XSLT(etree.parse(mml2tex_file))

return mml2tex
4 changes: 2 additions & 2 deletions protollm/raw_data_processing/docs_transformers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
from chunk_merger import ChunkMerger
from recursive_splitter import RecursiveSplitter
from .chunk_merger import ChunkMerger
from .recursive_splitter import RecursiveSplitter
10 changes: 9 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,15 @@ name = "ProtoLLM"
version = "0.1.0"
description = ""
authors = ["aimclub"]
include = [
"protollm/rags/configs/*.yaml",
"protollm/rags/configs/*.env",
"protollm/rags/docs/*.docx"
]


[tool.poetry.dependencies]
python = "^3.10"
python = ">=3.10,<3.13"
aioredis = "^2.0.1"
pydantic = "^2.7.4"
celery = "^5.4.0"
Expand All @@ -24,6 +29,9 @@ langchain = "^0.3.4"
httpx = "^0.27.0"
openai = "^1.42.0"
tornado = "^6.4.1"
chardet = "^5.2.0"
ftfy = "^6.3.1"
spacy = "^3.8.4"

[tool.poetry.group.dev.dependencies]
pytest = "^8.2.2"
Expand Down