Configuration API

`pretok.config.PretokConfig`

Bases: BaseModel

Root configuration for pretok.

Source code in src/pretok/config/schema.py

class PretokConfig(BaseModel):
    """Root configuration for pretok."""

    # Version tag of the configuration schema; "1.0" when not supplied.
    version: str = Field(
        default="1.0",
        description="Configuration schema version",
    )
    # One nested model per configuration section. Every section is declared
    # with default_factory=<its own model class>, so an omitted section is
    # populated from that model's defaults (assuming standard Pydantic Field
    # semantics).
    pipeline: PipelineConfig = Field(default_factory=PipelineConfig)
    detection: DetectionConfig = Field(default_factory=DetectionConfig)
    translation: TranslationConfig = Field(default_factory=TranslationConfig)
    cache: CacheConfig = Field(default_factory=CacheConfig)
    segment: SegmentConfig = Field(default_factory=SegmentConfig)
    models: ModelsConfig = Field(default_factory=ModelsConfig)

    def to_dict(self) -> dict[str, Any]:
        """Convert configuration to dictionary."""
        # Thin wrapper: the result is whatever the inherited model_dump() returns.
        return self.model_dump()

`to_dict()`

Convert configuration to dictionary.

Source code in src/pretok/config/schema.py

def to_dict(self) -> dict[str, Any]:
    """Return this configuration as a dictionary.

    Returns:
        The value produced by ``self.model_dump()``, unchanged.
    """
    as_mapping = self.model_dump()
    return as_mapping

`pretok.config.PipelineConfig`

Bases: BaseModel

Configuration for the core pipeline.

Source code in src/pretok/config/schema.py

class PipelineConfig(BaseModel):
    """Configuration for the core pipeline."""

    # --- Backend selection -------------------------------------------------
    # Name of the detector backend used by default ("langdetect" unless set).
    default_detector: str = Field(
        default="langdetect",
        description="Default language detector backend",
    )
    # Name of the translator backend; None means no default is configured.
    default_translator: str | None = Field(
        default=None,
        description="Default translator backend",
    )

    # --- Caching -------------------------------------------------------------
    # Translation caching is on by default.
    cache_enabled: bool = Field(
        default=True,
        description="Enable translation caching",
    )
    # Name of the cache backend ("memory" unless set).
    cache_backend: str = Field(
        default="memory",
        description="Cache backend to use",
    )

    # --- Detection threshold ---------------------------------------------------
    # Constrained to the closed interval [0.0, 1.0] via ge/le; defaults to 0.8.
    confidence_threshold: float = Field(
        default=0.8,
        ge=0.0,
        le=1.0,
        description="Minimum confidence for language detection",
    )

    # --- Failure policy --------------------------------------------------------
    # NOTE(review): fallback_to_original and strict_mode describe opposite
    # failure behaviors; nothing in this model says which one takes precedence
    # when both are True - confirm against the pipeline implementation.
    fallback_to_original: bool = Field(
        default=True,
        description="Return original text if translation fails",
    )
    strict_mode: bool = Field(
        default=False,
        description="Raise exceptions on failures instead of falling back",
    )

`pretok.config.DetectionConfig`

Bases: BaseModel

Configuration for language detection.

Source code in src/pretok/config/schema.py

class DetectionConfig(BaseModel):
    """Configuration for language detection."""

    # One nested settings model per detector backend; each falls back to its
    # own model's defaults when the section is omitted.
    fasttext: FastTextConfig = Field(default_factory=FastTextConfig)
    langdetect: LangDetectConfig = Field(default_factory=LangDetectConfig)
    composite: CompositeDetectorConfig = Field(default_factory=CompositeDetectorConfig)

`pretok.config.TranslationConfig`

Bases: BaseModel

Configuration for translation engines.

Source code in src/pretok/config/schema.py

class TranslationConfig(BaseModel):
    """Configuration for translation engines."""

    # One nested settings model per translation engine; each falls back to its
    # own model's defaults when the section is omitted.
    llm: LLMTranslatorConfig = Field(default_factory=LLMTranslatorConfig)
    huggingface: HuggingFaceTranslatorConfig = Field(default_factory=HuggingFaceTranslatorConfig)
    google: GoogleTranslatorConfig = Field(default_factory=GoogleTranslatorConfig)
    deepl: DeepLTranslatorConfig = Field(default_factory=DeepLTranslatorConfig)

`pretok.config.LLMTranslatorConfig`

Bases: BaseModel

Configuration for LLM-based translator (OpenAI-compatible APIs).

Source code in src/pretok/config/schema.py

class LLMTranslatorConfig(BaseModel):
    """Configuration for LLM-based translator (OpenAI-compatible APIs)."""

    # --- Endpoint and credentials ----------------------------------------------
    # Optional endpoint override; None leaves the choice to the client.
    base_url: str | None = Field(
        default=None,
        description="API base URL (OpenAI, OpenRouter, Ollama, vLLM, etc.)",
    )
    # Inline key. When set to a non-empty value it takes priority over the
    # environment variable (see get_api_key below).
    api_key: str | None = Field(
        default=None,
        description="API key (if not using env var)",
    )
    # Name of the environment variable consulted when api_key is not set.
    api_key_env: str = Field(
        default="OPENAI_API_KEY",
        description="Environment variable name for API key",
    )

    # --- Model and sampling ----------------------------------------------------
    model: str = Field(
        default="gpt-4o-mini",
        description="Model name to use",
    )
    # Constrained to [0.0, 2.0] via ge/le.
    temperature: float = Field(
        default=0.3,
        ge=0.0,
        le=2.0,
        description="Sampling temperature",
    )

    # --- Retry policy ----------------------------------------------------------
    # ge=0 permits 0, i.e. a configuration with no retries.
    max_retries: int = Field(
        default=3,
        ge=0,
        description="Maximum retry attempts",
    )
    # Non-negative number of seconds.
    retry_delay: float = Field(
        default=1.0,
        ge=0.0,
        description="Delay between retries in seconds",
    )

    # --- Prompt customisation --------------------------------------------------
    # None for either prompt means no custom prompt is configured here.
    system_prompt: str | None = Field(
        default=None,
        description="Custom system prompt for translation",
    )
    user_prompt_template: str | None = Field(
        default=None,
        description="Custom user prompt template with {text}, {source}, {target}",
    )

    # --- Response length budget ------------------------------------------------
    # Explicit cap (>= 1 when given). Per the field descriptions, an explicit
    # max_tokens takes precedence over max_tokens_multiplier; that precedence
    # is applied by the consumer of this config, not in this class.
    max_tokens: int | None = Field(
        default=None,
        ge=1,
        description="Maximum tokens for API response. If set, overrides max_tokens_multiplier. "
        "Recommended for thinking models (qwen3, DeepSeek-R1) that need more tokens.",
    )
    # Scale factor (>= 1) used to derive a budget from the input length.
    max_tokens_multiplier: int = Field(
        default=4,
        ge=1,
        description="Multiplier for calculating max_tokens from input length. "
        "max_tokens = len(text) * multiplier. Ignored if max_tokens is set.",
    )

    def get_api_key(self) -> str | None:
        """Get API key from config or environment."""
        # Truthiness check: None and the empty string are both treated as
        # "not configured" and fall through to the environment lookup.
        if self.api_key:
            return self.api_key
        # os.environ.get returns None when the variable is absent, so the
        # method yields None if neither source provides a key.
        return os.environ.get(self.api_key_env)

`get_api_key()`

Get API key from config or environment.

Source code in src/pretok/config/schema.py

def get_api_key(self) -> str | None:
    """Get API key from config or environment."""
    if self.api_key:
        return self.api_key
    return os.environ.get(self.api_key_env)

`pretok.config.CacheConfig`

Bases: BaseModel

Configuration for caching.

Source code in src/pretok/config/schema.py

class CacheConfig(BaseModel):
    """Configuration for caching."""

    # One nested settings model per cache backend; each falls back to its own
    # model's defaults when the section is omitted.
    memory: MemoryCacheConfig = Field(default_factory=MemoryCacheConfig)
    redis: RedisCacheConfig = Field(default_factory=RedisCacheConfig)
    sqlite: SQLiteCacheConfig = Field(default_factory=SQLiteCacheConfig)

`pretok.config.SegmentConfig`

Bases: BaseModel

Configuration for segment processing.

Source code in src/pretok/config/schema.py

class SegmentConfig(BaseModel):
    """Configuration for segment processing."""

    # --- Code and structured content ---------------------------------------------
    # Code blocks are left untranslated by default.
    preserve_code_blocks: bool = Field(
        default=True,
        description="Preserve code blocks without translation",
    )
    # Off by default: comments inside code blocks are not translated.
    translate_code_comments: bool = Field(
        default=False,
        description="Translate comments within code blocks",
    )
    # Off by default: JSON string values are not translated.
    translate_json_strings: bool = Field(
        default=False,
        description="Translate string values in JSON",
    )

    # --- Markers and prompt format -------------------------------------------------
    # default_factory=list gives each instance its own empty list.
    custom_markers: list[CustomMarkerConfig] = Field(
        default_factory=list,
        description="Custom segment markers",
    )
    # Free-form string; the description lists chatml, llama and alpaca, but no
    # validation of the value is declared on this field.
    format_hint: str | None = Field(
        default=None,
        description="Force specific prompt format: chatml, llama, alpaca",
    )

`pretok.config.load_config(path=None, *, config_dict=None, auto_discover=True)`

Load and validate pretok configuration.

Configuration is loaded with the following hierarchy (later overrides earlier):

1. Built-in defaults
2. Configuration file (if found/specified)
3. Runtime config_dict overrides

Parameters:

Name	Type	Description	Default
`path`	`str \| Path \| None`	Path to configuration file (optional)	`None`
`config_dict`	`dict[str, Any] \| None`	Runtime configuration overrides	`None`
`auto_discover`	`bool`	Whether to auto-discover config file if path not specified	`True`

Returns:

Type	Description
`PretokConfig`	Validated PretokConfig instance

Raises:

Type	Description
`ConfigurationError`	If configuration is invalid

Source code in src/pretok/config/loader.py

def load_config(
    path: str | Path | None = None,
    *,
    config_dict: dict[str, Any] | None = None,
    auto_discover: bool = True,
) -> PretokConfig:
    """Build a validated PretokConfig from layered sources.

    Sources are applied in this order, each one overriding the previous:

    1. Built-in defaults (supplied by the model itself)
    2. A configuration file, either the explicit ``path`` or an
       auto-discovered one
    3. Runtime overrides passed in ``config_dict``

    Args:
        path: Configuration file to load. When given, auto-discovery is skipped.
        config_dict: Runtime overrides merged on top of the file contents.
        auto_discover: When ``path`` is None, whether to search for a
            configuration file.

    Returns:
        Validated PretokConfig instance.

    Raises:
        ConfigurationError: If the merged settings fail validation. The
            per-error details are attached as ``errors``.
    """
    # Layer 2: pick the file-based settings, if any.
    merged: dict[str, Any]
    if path is not None:
        merged = load_config_file(path)
    else:
        discovered = find_config_file() if auto_discover else None
        merged = load_config_file(discovered) if discovered else {}

    # Layer 3: runtime overrides win over file contents.
    if config_dict:
        merged = _deep_merge(merged, config_dict)

    # Layer 1 is implicit: missing keys fall back to the model's defaults.
    try:
        return PretokConfig(**merged)
    except ValidationError as exc:
        details = exc.errors()
        summary = "\n".join(f"  - {item['loc']}: {item['msg']}" for item in details)
        raise ConfigurationError(
            "Invalid configuration:\n" + summary,
            errors=[dict(item) for item in details],
        ) from exc

`pretok.config.ConfigurationError`

Bases: Exception

Raised when configuration is invalid.

Source code in src/pretok/config/loader.py

class ConfigurationError(Exception):
    """Raised when configuration is invalid."""

    def __init__(self, message: str, errors: list[dict[str, Any]] | None = None) -> None:
        super().__init__(message)
        self.errors = errors or []