Skip to content

Segments API

pretok.segment.Segment dataclass

A segment of text within a prompt.

Segments represent distinct parts of a prompt that may need different handling during translation.

Attributes:

Name Type Description
type SegmentType

The type of segment

content str

The text content of the segment

start int

Starting character index in original text

end int

Ending character index in original text

language str | None

Detected language code (if applicable)

metadata dict[str, Any]

Additional segment metadata

translatable bool

Whether this segment should be translated

Source code in src/pretok/segment/types.py
@dataclass
class Segment:
    """A segment of text within a prompt.

    Segments represent distinct parts of a prompt that may need
    different handling during translation.

    Attributes:
        type: The type of segment
        content: The text content of the segment
        start: Starting character index in original text
        end: Ending character index in original text
        language: Detected language code (if applicable)
        metadata: Additional segment metadata
        translatable: Whether this segment should be translated
    """

    type: SegmentType
    content: str
    start: int
    end: int
    language: str | None = None
    metadata: dict[str, Any] = field(default_factory=dict)
    translatable: bool = True

    def __post_init__(self) -> None:
        """Set default translatability based on type."""
        # These types are never translated; an explicit translatable=True
        # passed by the caller is deliberately overridden here.
        never_translated = (
            SegmentType.CODE,
            SegmentType.ROLE_MARKER,
            SegmentType.CONTROL_TOKEN,
            SegmentType.DELIMITER,
            SegmentType.WHITESPACE,
        )
        if self.type in never_translated:
            self.translatable = False

    def __len__(self) -> int:
        """Return length of content."""
        return len(self.content)

    @property
    def span(self) -> tuple[int, int]:
        """Return (start, end) tuple."""
        return self.start, self.end

    def _copy_with(self, **overrides: Any) -> Segment:
        """Return a copy of this segment with the given fields replaced.

        The metadata dict is shallow-copied so the clone never shares
        mutable state with the original.
        """
        kwargs: dict[str, Any] = {
            "type": self.type,
            "content": self.content,
            "start": self.start,
            "end": self.end,
            "language": self.language,
            "metadata": self.metadata.copy(),
            "translatable": self.translatable,
        }
        kwargs.update(overrides)
        return Segment(**kwargs)

    def with_content(self, new_content: str) -> Segment:
        """Create new segment with different content.

        Note that start/end still refer to the original text, so they are
        intentionally left untouched.

        Args:
            new_content: New content string

        Returns:
            New Segment with updated content
        """
        return self._copy_with(content=new_content)

    def with_language(self, language: str) -> Segment:
        """Create new segment with detected language.

        Args:
            language: ISO 639-1 language code

        Returns:
            New Segment with language set
        """
        return self._copy_with(language=language)

span property

Return (start, end) tuple.

__len__()

Return length of content.

Source code in src/pretok/segment/types.py
def __len__(self) -> int:
    """Length of this segment, measured in characters of its content."""
    content_length = len(self.content)
    return content_length

__post_init__()

Set default translatability based on type.

Source code in src/pretok/segment/types.py
def __post_init__(self) -> None:
    """Set default translatability based on type."""
    # Segments of these types carry structural or code content and are
    # never handed to translation, regardless of the caller's input.
    if self.type in (
        SegmentType.CODE,
        SegmentType.ROLE_MARKER,
        SegmentType.CONTROL_TOKEN,
        SegmentType.DELIMITER,
        SegmentType.WHITESPACE,
    ):
        self.translatable = False

with_content(new_content)

Create new segment with different content.

Parameters:

Name Type Description Default
new_content str

New content string

required

Returns:

Type Description
Segment

New Segment with updated content

Source code in src/pretok/segment/types.py
def with_content(self, new_content: str) -> Segment:
    """Create new segment with different content.

    Args:
        new_content: New content string

    Returns:
        New Segment with updated content
    """
    # Build the clone field-by-field; metadata is shallow-copied so the
    # new segment never shares mutable state with this one.
    replacement_fields = {
        "type": self.type,
        "content": new_content,
        "start": self.start,
        "end": self.end,
        "language": self.language,
        "metadata": self.metadata.copy(),
        "translatable": self.translatable,
    }
    return Segment(**replacement_fields)

with_language(language)

Create new segment with detected language.

Parameters:

Name Type Description Default
language str

ISO 639-1 language code

required

Returns:

Type Description
Segment

New Segment with language set

Source code in src/pretok/segment/types.py
def with_language(self, language: str) -> Segment:
    """Create new segment with detected language.

    Args:
        language: ISO 639-1 language code

    Returns:
        New Segment with language set
    """
    # Build the clone field-by-field; metadata is shallow-copied so the
    # new segment never shares mutable state with this one.
    replacement_fields = {
        "type": self.type,
        "content": self.content,
        "start": self.start,
        "end": self.end,
        "language": language,
        "metadata": self.metadata.copy(),
        "translatable": self.translatable,
    }
    return Segment(**replacement_fields)

pretok.segment.SegmentType

Bases: Enum

Types of segments in a prompt.

Different segment types have different handling rules:

- TEXT: Regular text content that should be translated
- CODE: Code blocks that should NOT be translated (except comments)
- JSON: JSON content where only string values may be translated
- ROLE_MARKER: Chat format markers (e.g., `<|user|>`, `[INST]`)
- CONTROL_TOKEN: Special tokens (e.g., `<|endoftext|>`, `<s>`)
- DELIMITER: Format delimiters (e.g., `###`, `---`)
- WHITESPACE: Significant whitespace that should be preserved
- COMMENT: Code comments (may be translated based on config)

Source code in src/pretok/segment/types.py
class SegmentType(Enum):
    """Types of segments in a prompt.

    Different segment types have different handling rules:
    - TEXT: Regular text content that should be translated
    - CODE: Code blocks that should NOT be translated (except comments)
    - JSON: JSON content where only string values may be translated
    - ROLE_MARKER: Chat format markers (e.g., <|user|>, [INST])
    - CONTROL_TOKEN: Special tokens (e.g., <|endoftext|>, <s>)
    - DELIMITER: Format delimiters (e.g., ###, ---)
    - WHITESPACE: Significant whitespace that should be preserved
    - COMMENT: Code comments (may be translated based on config)
    """

    # auto() assigns increasing integer values in declaration order, so the
    # numeric values are not stable across reordering — compare members,
    # never their .value.
    TEXT = auto()
    CODE = auto()
    JSON = auto()
    ROLE_MARKER = auto()
    CONTROL_TOKEN = auto()
    DELIMITER = auto()
    WHITESPACE = auto()
    COMMENT = auto()

pretok.segment.PromptLexer

Lexer for parsing prompts into segments.

The lexer identifies different types of content in a prompt:

- Role markers (e.g., `<|user|>`, `[INST]`)
- Control tokens (e.g., `<s>`, `</s>`)
- Code blocks
- JSON content
- Regular text

Example

>>> lexer = PromptLexer()
>>> segments = lexer.lex("<|im_start|>user\nHello!<|im_end|>")
>>> [s.type for s in segments]
[SegmentType.ROLE_MARKER, SegmentType.TEXT, SegmentType.ROLE_MARKER]

Source code in src/pretok/segment/lexer.py
class PromptLexer:
    """Lexer for parsing prompts into segments.

    The lexer identifies different types of content in a prompt:
    - Role markers (e.g., `<|user|>`, `[INST]`)
    - Control tokens (e.g., `<s>`, `</s>`)
    - Code blocks
    - JSON content
    - Regular text

    Example:
        >>> lexer = PromptLexer()
        >>> segments = lexer.lex("<|im_start|>user\\nHello!<|im_end|>")
        >>> [s.type for s in segments]
        [`SegmentType.ROLE_MARKER`, `SegmentType.TEXT`, `SegmentType.ROLE_MARKER`]
    """

    def __init__(
        self,
        config: SegmentConfig | None = None,
        format_hint: str | None = None,
    ) -> None:
        """Initialize the lexer.

        Args:
            config: Optional segment configuration
            format_hint: Hint for prompt format ('chatml', 'llama', etc.)
        """
        self._config = config
        # An explicit (truthy) format_hint wins over the config's hint.
        self._format_hint = format_hint or (config.format_hint if config else None)
        # Compiled once up front; lex() relies on this list's ordering.
        self._patterns = self._build_patterns()

    def _build_patterns(self) -> list[tuple[re.Pattern[str], SegmentType]]:
        """Build compiled regex patterns for tokenization."""
        # Registration order defines priority: lex() takes the FIRST pattern
        # that matches at the current position, so earlier entries win.
        patterns: list[tuple[re.Pattern[str], SegmentType]] = []

        # Add format-specific patterns if format is specified
        if self._format_hint and self._format_hint in PROMPT_FORMATS:
            fmt = PROMPT_FORMATS[self._format_hint]
            for p in fmt.role_patterns:
                patterns.append((re.compile(p), SegmentType.ROLE_MARKER))
            for p in fmt.control_patterns:
                patterns.append((re.compile(p), SegmentType.CONTROL_TOKEN))
            for p in fmt.delimiter_patterns:
                patterns.append((re.compile(p), SegmentType.DELIMITER))
        else:
            # Add all known patterns if no format specified
            for fmt in PROMPT_FORMATS.values():
                for p in fmt.role_patterns:
                    patterns.append((re.compile(p), SegmentType.ROLE_MARKER))
                for p in fmt.control_patterns:
                    patterns.append((re.compile(p), SegmentType.CONTROL_TOKEN))
                for p in fmt.delimiter_patterns:
                    patterns.append((re.compile(p), SegmentType.DELIMITER))

        # Add custom markers from config
        if self._config and self._config.custom_markers:
            for marker in self._config.custom_markers:
                pattern = marker.pattern
                if not marker.is_regex:
                    # Literal markers are escaped so they match verbatim.
                    pattern = re.escape(pattern)

                # Name lookup on the enum; raises KeyError for an unknown
                # segment-type name in the config.
                seg_type = SegmentType[marker.type]
                patterns.append((re.compile(pattern), seg_type))

        # Code block patterns (markdown-style)
        patterns.extend(
            [
                # Fenced code blocks with language
                (re.compile(r"```[\w]*\n[\s\S]*?```", re.MULTILINE), SegmentType.CODE),
                # Fenced code blocks without language
                (re.compile(r"```[\s\S]*?```", re.MULTILINE), SegmentType.CODE),
                # Inline code
                (re.compile(r"`[^`\n]+`"), SegmentType.CODE),
            ]
        )

        # JSON detection (simple heuristic)
        # NOTE: this regex only tolerates one level of nested braces; deeper
        # nesting is matched piecemeal.
        patterns.append((re.compile(r"\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}"), SegmentType.JSON))

        return patterns

    def lex(self, text: str) -> list[Segment]:
        """Parse text into segments.

        Args:
            text: Input text to parse

        Returns:
            List of Segment objects
        """
        if not text:
            return []

        segments: list[Segment] = []
        position = 0

        while position < len(text):
            # Try to match special patterns
            match_found = False

            # First pattern that matches at `position` wins — the
            # registration order in _build_patterns() is the priority.
            for pattern, seg_type in self._patterns:
                # match() is anchored at `position` (it cannot match later).
                match = pattern.match(text, position)
                if match:
                    # Add any preceding text
                    # (defensive: an anchored match starts exactly at
                    # `position`, so this branch is not expected to fire)
                    if match.start() > position:
                        segments.append(
                            Segment(
                                type=SegmentType.TEXT,
                                content=text[position : match.start()],
                                start=position,
                                end=match.start(),
                            )
                        )

                    # Add the matched segment
                    segments.append(
                        Segment(
                            type=seg_type,
                            content=match.group(),
                            start=match.start(),
                            end=match.end(),
                        )
                    )

                    position = match.end()
                    match_found = True
                    break

            if not match_found:
                # Find next potential match start
                # (earliest start over all patterns; len(text) if none hit)
                next_match_start = len(text)
                for pattern, _ in self._patterns:
                    search = pattern.search(text, position)
                    if search and search.start() < next_match_start:
                        next_match_start = search.start()

                # Add text segment up to next match (or end)
                # Progress is guaranteed: since no pattern matched AT
                # `position`, every search() hit starts strictly after it,
                # so next_match_start > position and the loop advances.
                if next_match_start > position:
                    segments.append(
                        Segment(
                            type=SegmentType.TEXT,
                            content=text[position:next_match_start],
                            start=position,
                            end=next_match_start,
                        )
                    )
                    position = next_match_start

        return self._merge_adjacent_text(segments)

    def _merge_adjacent_text(self, segments: list[Segment]) -> list[Segment]:
        """Merge adjacent TEXT segments.

        Args:
            segments: List of segments

        Returns:
            Merged segment list
        """
        if not segments:
            return segments

        merged: list[Segment] = []

        for seg in segments:
            if merged and merged[-1].type == SegmentType.TEXT and seg.type == SegmentType.TEXT:
                # Merge with previous
                # (a fresh Segment is built so spans stay consistent:
                # start from the first run, end from the last)
                prev = merged[-1]
                merged[-1] = Segment(
                    type=SegmentType.TEXT,
                    content=prev.content + seg.content,
                    start=prev.start,
                    end=seg.end,
                )
            else:
                merged.append(seg)

        return merged

__init__(config=None, format_hint=None)

Initialize the lexer.

Parameters:

Name Type Description Default
config SegmentConfig | None

Optional segment configuration

None
format_hint str | None

Hint for prompt format ('chatml', 'llama', etc.)

None
Source code in src/pretok/segment/lexer.py
def __init__(
    self,
    config: SegmentConfig | None = None,
    format_hint: str | None = None,
) -> None:
    """Initialize the lexer.

    Args:
        config: Optional segment configuration
        format_hint: Hint for prompt format ('chatml', 'llama', etc.)
    """
    self._config = config
    # An explicit (truthy) format_hint takes precedence; otherwise fall
    # back to the config's hint when a config was supplied.
    fallback_hint = config.format_hint if config else None
    self._format_hint = format_hint or fallback_hint
    # Compile the pattern table once, up front.
    self._patterns = self._build_patterns()

lex(text)

Parse text into segments.

Parameters:

Name Type Description Default
text str

Input text to parse

required

Returns:

Type Description
list[Segment]

List of Segment objects

Source code in src/pretok/segment/lexer.py
def lex(self, text: str) -> list[Segment]:
    """Parse text into segments.

    Args:
        text: Input text to parse

    Returns:
        List of Segment objects
    """
    if not text:
        return []

    segments: list[Segment] = []
    position = 0

    while position < len(text):
        # Try to match special patterns
        match_found = False

        # First pattern that matches at `position` wins — the order the
        # patterns were registered in is the priority order.
        for pattern, seg_type in self._patterns:
            # match() is anchored at `position` (it cannot match later).
            match = pattern.match(text, position)
            if match:
                # Add any preceding text
                # (defensive: an anchored match starts exactly at
                # `position`, so this branch is not expected to fire)
                if match.start() > position:
                    segments.append(
                        Segment(
                            type=SegmentType.TEXT,
                            content=text[position : match.start()],
                            start=position,
                            end=match.start(),
                        )
                    )

                # Add the matched segment
                segments.append(
                    Segment(
                        type=seg_type,
                        content=match.group(),
                        start=match.start(),
                        end=match.end(),
                    )
                )

                position = match.end()
                match_found = True
                break

        if not match_found:
            # Find next potential match start
            # (earliest start over all patterns; len(text) if none hit)
            next_match_start = len(text)
            for pattern, _ in self._patterns:
                search = pattern.search(text, position)
                if search and search.start() < next_match_start:
                    next_match_start = search.start()

            # Add text segment up to next match (or end)
            # Progress is guaranteed: since no pattern matched AT
            # `position`, every search() hit starts strictly after it,
            # so next_match_start > position and the loop advances.
            if next_match_start > position:
                segments.append(
                    Segment(
                        type=SegmentType.TEXT,
                        content=text[position:next_match_start],
                        start=position,
                        end=next_match_start,
                    )
                )
                position = next_match_start

    return self._merge_adjacent_text(segments)

pretok.segment.lex_prompt(text, *, format_hint=None, config=None)

Convenience function to lex a prompt.

Parameters:

Name Type Description Default
text str

Input text to parse

required
format_hint str | None

Hint for prompt format

None
config SegmentConfig | None

Optional segment configuration

None

Returns:

Type Description
list[Segment]

List of Segment objects

Source code in src/pretok/segment/lexer.py
def lex_prompt(
    text: str,
    *,
    format_hint: str | None = None,
    config: SegmentConfig | None = None,
) -> list[Segment]:
    """Convenience function to lex a prompt.

    Builds a throwaway PromptLexer and runs it over *text*.

    Args:
        text: Input text to parse
        format_hint: Hint for prompt format
        config: Optional segment configuration

    Returns:
        List of Segment objects
    """
    return PromptLexer(config=config, format_hint=format_hint).lex(text)