
Indic Transliteration Model

A fine-tuned Gemma-3-270M model for transliteration between Indic scripts and Latin/Romanized text.

Model Details

  • Base Model: google/gemma-3-270m
  • Architecture: Gemma-3-270M with custom character-level tokenizer
  • Vocabulary Size: ~1,851 tokens (character-level)
  • Training: Fine-tuned on 100k transliteration pairs
  • Format: [BOS][LANG]{source}[SEP]{target}[EOS]
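
To make the sequence format concrete, here is a minimal sketch of how one training pair would be serialized. The language-token string [LATIN] is an assumption, based on the special tokens listed under Model Architecture below:

# One Hindi -> Latin pair in the [BOS][LANG]{source}[SEP]{target}[EOS] format.
# "[LATIN]" is an assumed language-token string (cf. [HINDI], [BENGALI] below).
source, target = "नमस्ते", "namaste"
sequence = f"[BOS][LATIN]{source}[SEP]{target}[EOS]"
print(sequence)  # [BOS][LATIN]नमस्ते[SEP]namaste[EOS]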

Supported Languages

The model supports transliteration for the following languages and scripts (a usage sketch follows the list):

  • Assamese
  • Bengali
  • Bodo
  • Dogri
  • Gujarati
  • Hindi
  • Kannada
  • Kashmiri
  • Konkani
  • Maithili
  • Malayalam
  • Manipuri
  • Marathi
  • Nepali
  • Odia
  • Punjabi
  • Sanskrit
  • Santali
  • Sindhi
  • Tamil
  • Telugu
  • Urdu
  • Latin (Romanized output)
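
Any of these names can be passed as the target language. A minimal fan-out sketch, assuming model and tokenizer are loaded as in Quick Start and transliterate is the helper from inference_simple.py further down:

# Transliterate one word into several target scripts.
for lang in ["Hindi", "Bengali", "Tamil"]:
    print(lang, "->", transliterate(model, tokenizer, "namaste", lang, device=str(model.device)))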

Usage

Installation

pip install torch transformers huggingface_hub

Quick Start

import torch
from transformers import AutoModelForCausalLM

from char_tokenizer import CharTokenizer

# Load model (the custom model class handles the embedding fix automatically)
model = AutoModelForCausalLM.from_pretrained(
    "psidharth567/indic-xlit-270M",
    trust_remote_code=True,  # required for the custom model class
    torch_dtype=torch.bfloat16,
)

# Load tokenizer
tokenizer = CharTokenizer.load("path/to/char_tokenizer")

# Transliterate
lang_token = tokenizer.get_lang_token("Latin")
prompt = f"{tokenizer.bos_token}{lang_token}नमस्ते{tokenizer.sep_token}"
input_ids = tokenizer.encode_with_special(prompt)
# ... generate and decode
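
The elided step mirrors the transliterate() helper in the full script below; a minimal sketch of greedy generation and decoding:

# Generate greedily and decode (mirrors transliterate() in inference_simple.py)
input_tensor = torch.tensor([input_ids], dtype=torch.long).to(model.device)
with torch.no_grad():
    output_ids = model.generate(
        input_tensor,
        max_new_tokens=256,
        do_sample=False,
        pad_token_id=tokenizer.pad_id,
        eos_token_id=tokenizer.eos_id,
    )
generated = output_ids[0, len(input_ids):].tolist()
if tokenizer.eos_id in generated:
    generated = generated[: generated.index(tokenizer.eos_id)]
print(tokenizer.decode(generated))  # the Romanized output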

Simple Script

Use the provided inference_simple.py script:

"""
Simple inference script for Indic Transliteration Model.

Usage:
    python inference_simple.py --text "namaste" --target Hindi
    python inference_simple.py --text "नमस्ते" --target Latin
    python inference_simple.py --interactive
"""
from __future__ import annotations

import argparse
import sys
from pathlib import Path

# Force UTF-8 console on Windows
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8")
if hasattr(sys.stderr, "reconfigure"):
    sys.stderr.reconfigure(encoding="utf-8")

import torch
import torch.nn as nn
from transformers import AutoModelForCausalLM

# Import char_tokenizer
try:
    from char_tokenizer import CharTokenizer
except ImportError:
    sys.path.insert(0, str(Path(__file__).parent))
    from char_tokenizer import CharTokenizer


def load_model(model_id: str, device: str | None = None):
    """
    Load the transliteration model.
    
    Args:
        model_id: Hugging Face model ID or local path
        device: Device to use ('cuda' or 'cpu'). If None, auto-detects.
    
    Returns:
        Loaded model and tokenizer
    """
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    
    print(f"Loading model: {model_id}")
    print(f"Device: {device}")
    
    # Load tokenizer
    if Path(model_id).exists():
        tokenizer_path = Path(model_id) / "char_tokenizer"
        if not tokenizer_path.exists():
            tokenizer_path = Path(model_id)
    else:
        # Hugging Face - download tokenizer
        from huggingface_hub import snapshot_download
        cache_dir = snapshot_download(repo_id=model_id, allow_patterns="char_tokenizer/*")
        tokenizer_path = Path(cache_dir) / "char_tokenizer"
    
    tokenizer = CharTokenizer.load(tokenizer_path)
    print(f"  Tokenizer vocab size: {tokenizer.vocab_size}")
    
    # Load config first to disable weight tying
    from transformers import AutoConfig
    config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
    config.tie_word_embeddings = False  # embeddings and LM head are separate
    
    # Load model with trust_remote_code=True to use custom model class
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        config=config,
        trust_remote_code=True,  # This loads the custom model class
        dtype=torch.bfloat16 if device == "cuda" else torch.float32,
        attn_implementation="sdpa",
    )
    model = model.to(device)
    model.eval()
    
    # CRITICAL: Fix embedding wrapper - REQUIRED for Gemma models
    # Even with custom model class, ensure the fix is applied
    if hasattr(model.model, "embed_tokens"):
        old_embed = model.model.embed_tokens
        if "ScaledWordEmbedding" in str(type(old_embed)):
            print("  Fixing embedding wrapper...")
            embed_device = next(old_embed.parameters()).device
            embed_dtype = next(old_embed.parameters()).dtype
            new_embed = nn.Embedding(
                old_embed.num_embeddings,
                old_embed.embedding_dim,
                padding_idx=getattr(old_embed, "padding_idx", None),
                device=embed_device,
                dtype=embed_dtype,
            )
            new_embed.weight.data = old_embed.weight.data.clone()
            model.model.embed_tokens = new_embed
            print("  ✓ Fixed embedding wrapper")
        else:
            print("  Embedding is already correct")
    
    # Fix token IDs in config
    model.config.pad_token_id = tokenizer.pad_id
    model.config.eos_token_id = tokenizer.eos_id
    model.config.bos_token_id = tokenizer.bos_id
    
    print("Ready!")
    return model, tokenizer


def transliterate(
    model,
    tokenizer: CharTokenizer,
    text: str,
    target_language: str = "Latin",
    max_new_tokens: int = 256,
    device: str = "cuda",
) -> str:
    """
    Transliterate text to the target language/script.
    
    Args:
        model: Loaded model
        tokenizer: CharTokenizer instance
        text: Source text to transliterate
        target_language: Target language (e.g., "Hindi", "Latin", "Bengali")
        max_new_tokens: Maximum tokens to generate
        device: Device the model is on
    
    Returns:
        Transliterated text
    """
    lang_token = tokenizer.get_lang_token(target_language)
    prompt = (
        f"{tokenizer.bos_token}"
        f"{lang_token}"
        f"{text}"
        f"{tokenizer.sep_token}"
    )
    
    input_ids = tokenizer.encode_with_special(prompt)
    input_tensor = torch.tensor([input_ids], dtype=torch.long).to(device)
    
    with torch.no_grad():
        output_ids = model.generate(
            input_tensor,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            pad_token_id=tokenizer.pad_id,
            eos_token_id=tokenizer.eos_id,
            bos_token_id=tokenizer.bos_id,
        )
    
    generated_ids = output_ids[0, len(input_ids):].tolist()
    
    if tokenizer.eos_id in generated_ids:
        eos_pos = generated_ids.index(tokenizer.eos_id)
        generated_ids = generated_ids[:eos_pos]
    
    result = tokenizer.decode(generated_ids)
    return result


def main():
    parser = argparse.ArgumentParser(description="Transliterate text using the trained model")
    parser.add_argument("--text", "-t", type=str, help="Text to transliterate")
    parser.add_argument("--target", "-l", type=str, default="Latin", help="Target language")
    parser.add_argument("--model-id", type=str, help="Model ID or path (default: auto-detect)")
    parser.add_argument("--interactive", "-i", action="store_true", help="Interactive mode")
    args = parser.parse_args()
    
    # Default model ID
    if args.model_id is None:
        # Try local first, then you can set your HF model ID
        local_path = Path(__file__).parent.parent / "checkpoints" / "gemma3_char_translit" / "final"
        if local_path.exists():
            model_id = str(local_path)
        else:
            model_id = "psidharth567/indic-xlit-270M"  
    else:
        model_id = args.model_id
    
    # Load model
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model, tokenizer = load_model(model_id, device)
    
    if args.interactive:
        print("=" * 60)
        print("Transliteration Interactive Mode")
        print("=" * 60)
        print("Commands:")
        print("  Type text to transliterate")
        print("  Use '@lang' to set target language (e.g., @Hindi, @Latin)")
        print("  Type 'quit' or 'exit' to stop")
        print("=" * 60)
        
        target_lang = "Latin"
        print(f"Current target: {target_lang}\n")
        
        while True:
            try:
                user_input = input(">>> ").strip()
            except (EOFError, KeyboardInterrupt):
                print("\nGoodbye!")
                break
            
            if not user_input:
                continue
            
            if user_input.lower() in ("quit", "exit", "q"):
                print("Goodbye!")
                break
            
            if user_input.startswith("@"):
                target_lang = user_input[1:].strip().title()
                print(f"Target set to: {target_lang}")
                continue
            
            result = transliterate(model, tokenizer, user_input, target_lang, device=device)
            print(f"    {result}\n")
    
    elif args.text:
        result = transliterate(model, tokenizer, args.text, args.target, device=device)
        print(f"Input:  {args.text}")
        print(f"Target: {args.target}")
        print(f"Output: {result}")
    else:
        # Demo mode
        print("Demo transliterations:\n")
        demos = [
            ("नमस्ते", "Latin"),
            ("namaste", "Hindi"),
            ("धन्यवाद", "Latin"),
            ("dhanyavaad", "Hindi"),
        ]
        for text, target in demos:
            result = transliterate(model, tokenizer, text, target, device=device)
            print(f"  [{target:8}] {text} -> {result}")


if __name__ == "__main__":
    main()

Run it from the command line:

python inference_simple.py --text "नमस्ते" --target Latin
python inference_simple.py --text "namaste" --target Hindi
python inference_simple.py --interactive
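
The script's helpers can also be imported for batch use; a minimal sketch, assuming inference_simple.py is in the working directory:

# Batch transliteration via the functions defined in inference_simple.py.
import torch
from inference_simple import load_model, transliterate

device = "cuda" if torch.cuda.is_available() else "cpu"
model, tokenizer = load_model("psidharth567/indic-xlit-270M", device)
for word in ["namaste", "dhanyavaad"]:
    print(word, "->", transliterate(model, tokenizer, word, "Hindi", device=device))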

Model Architecture

  • Character-level tokenizer: Each Unicode character is a token
  • Custom vocabulary: Built from transliteration dataset
  • Separate embeddings and LM head: No weight tying
  • Special tokens: [BOS], [EOS], [SEP], [PAD], [UNK], and language tokens like [HINDI], [BENGALI], etc.
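
Because the tokenizer is character-level, sequence length tracks character count. A minimal round-trip sketch, assuming encode_with_special accepts plain text as it does the prompts above:

# Character-level round trip: roughly one token id per Unicode character.
tok = CharTokenizer.load("path/to/char_tokenizer")
ids = tok.encode_with_special("नमस्ते")
print(len("नमस्ते"), len(ids))  # id count ~ character count (plus any specials added)
print(tok.decode(ids))          # recovers the original string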