
Indic Transliteration Model

A fine-tuned Gemma-3-270M model for transliteration between Indic scripts and Latin/Romanized text.

Model Details

  • Base Model: google/gemma-3-270m
  • Architecture: Gemma-3-270M with custom character-level tokenizer
  • Vocabulary Size: ~1,851 tokens (character-level)
  • Training: Fine-tuned on 100k transliteration pairs
  • Format: [BOS][LANG]{source}[SEP]{target}[EOS]
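
To make the sequence format concrete, here is a minimal sketch of how one training pair would be serialized. The language-token string [LATIN] is an assumption, based on the special tokens listed under Model Architecture below:

# One Hindi -> Latin pair in the [BOS][LANG]{source}[SEP]{target}[EOS] format.
# "[LATIN]" is an assumed language-token string (cf. [HINDI], [BENGALI] below).
source, target = "नमस्ते", "namaste"
sequence = f"[BOS][LATIN]{source}[SEP]{target}[EOS]"
print(sequence)  # [BOS][LATIN]नमस्ते[SEP]namaste[EOS]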

Supported Languages

The model supports transliteration for the following languages and scripts (a usage sketch follows the list):

  • Assamese
  • Bengali
  • Bodo
  • Dogri
  • Gujarati
  • Hindi
  • Kannada
  • Kashmiri
  • Konkani
  • Maithili
  • Malayalam
  • Manipuri
  • Marathi
  • Nepali
  • Odia
  • Punjabi
  • Sanskrit
  • Santali
  • Sindhi
  • Tamil
  • Telugu
  • Urdu
  • Latin (Romanized output)
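
Any of these names can be passed as the target language. A minimal fan-out sketch, assuming model and tokenizer are loaded as in Quick Start and transliterate is the helper from inference_simple.py further down:

# Transliterate one word into several target scripts.
for lang in ["Hindi", "Bengali", "Tamil"]:
    print(lang, "->", transliterate(model, tokenizer, "namaste", lang, device=str(model.device)))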

Usage

Installation

pip install torch transformers huggingface_hub

Quick Start

import torch
from transformers import AutoModelForCausalLM

from char_tokenizer import CharTokenizer

# Load model (the custom model class handles the embedding fix automatically)
model = AutoModelForCausalLM.from_pretrained(
    "psidharth567/indic-xlit-270M",
    trust_remote_code=True,  # required for the custom model class
    torch_dtype=torch.bfloat16,
)

# Load tokenizer
tokenizer = CharTokenizer.load("path/to/char_tokenizer")

# Transliterate
lang_token = tokenizer.get_lang_token("Latin")
prompt = f"{tokenizer.bos_token}{lang_token}नमस्ते{tokenizer.sep_token}"
input_ids = tokenizer.encode_with_special(prompt)
# ... generate and decode
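
The elided step mirrors the transliterate() helper in the full script below; a minimal sketch of greedy generation and decoding:

# Generate greedily and decode (mirrors transliterate() in inference_simple.py)
input_tensor = torch.tensor([input_ids], dtype=torch.long).to(model.device)
with torch.no_grad():
    output_ids = model.generate(
        input_tensor,
        max_new_tokens=256,
        do_sample=False,
        pad_token_id=tokenizer.pad_id,
        eos_token_id=tokenizer.eos_id,
    )
generated = output_ids[0, len(input_ids):].tolist()
if tokenizer.eos_id in generated:
    generated = generated[: generated.index(tokenizer.eos_id)]
print(tokenizer.decode(generated))  # the Romanized output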

Simple Script

Use the provided inference_simple.py script:

"""
Simple inference script for Indic Transliteration Model.

Usage:
    python inference_simple.py --text "namaste" --target Hindi
    python inference_simple.py --text "नमस्ते" --target Latin
    python inference_simple.py --interactive
"""
from __future__ import annotations

import argparse
import sys
from pathlib import Path

# Force UTF-8 console on Windows
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8")
if hasattr(sys.stderr, "reconfigure"):
    sys.stderr.reconfigure(encoding="utf-8")

import torch
import torch.nn as nn
from transformers import AutoModelForCausalLM

# Import char_tokenizer
try:
    from char_tokenizer import CharTokenizer
except ImportError:
    sys.path.insert(0, str(Path(__file__).parent))
    from char_tokenizer import CharTokenizer


def load_model(model_id: str, device: str | None = None):
    """
    Load the transliteration model.
    
    Args:
        model_id: Hugging Face model ID or local path
        device: Device to use ('cuda' or 'cpu'). If None, auto-detects.
    
    Returns:
        Loaded model and tokenizer
    """
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    
    print(f"Loading model: {model_id}")
    print(f"Device: {device}")
    
    # Load tokenizer
    if Path(model_id).exists():
        tokenizer_path = Path(model_id) / "char_tokenizer"
        if not tokenizer_path.exists():
            tokenizer_path = Path(model_id)
    else:
        # Hugging Face - download tokenizer
        from huggingface_hub import snapshot_download
        cache_dir = snapshot_download(repo_id=model_id, allow_patterns="char_tokenizer/*")
        tokenizer_path = Path(cache_dir) / "char_tokenizer"
    
    tokenizer = CharTokenizer.load(tokenizer_path)
    print(f"  Tokenizer vocab size: {tokenizer.vocab_size}")
    
    # Load config first to disable weight tying
    from transformers import AutoConfig
    config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
    config.tie_word_embeddings = False  # embeddings and LM head are separate
    
    # Load model with trust_remote_code=True to use custom model class
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        config=config,
        trust_remote_code=True,  # This loads the custom model class
        dtype=torch.bfloat16 if device == "cuda" else torch.float32,
        attn_implementation="sdpa",
    )
    model = model.to(device)
    model.eval()
    
    # CRITICAL: Fix embedding wrapper - REQUIRED for Gemma models
    # Even with custom model class, ensure the fix is applied
    if hasattr(model.model, "embed_tokens"):
        old_embed = model.model.embed_tokens
        if "ScaledWordEmbedding" in str(type(old_embed)):
            print("  Fixing embedding wrapper...")
            embed_device = next(old_embed.parameters()).device
            embed_dtype = next(old_embed.parameters()).dtype
            new_embed = nn.Embedding(
                old_embed.num_embeddings,
                old_embed.embedding_dim,
                padding_idx=getattr(old_embed, "padding_idx", None),
                device=embed_device,
                dtype=embed_dtype,
            )
            new_embed.weight.data = old_embed.weight.data.clone()
            model.model.embed_tokens = new_embed
            print("  ✓ Fixed embedding wrapper")
        else:
            print("  Embedding is already correct")
    
    # Fix token IDs in config
    model.config.pad_token_id = tokenizer.pad_id
    model.config.eos_token_id = tokenizer.eos_id
    model.config.bos_token_id = tokenizer.bos_id
    
    print("Ready!")
    return model, tokenizer


def transliterate(
    model,
    tokenizer: CharTokenizer,
    text: str,
    target_language: str = "Latin",
    max_new_tokens: int = 256,
    device: str = "cuda",
) -> str:
    """
    Transliterate text to the target language/script.
    
    Args:
        model: Loaded model
        tokenizer: CharTokenizer instance
        text: Source text to transliterate
        target_language: Target language (e.g., "Hindi", "Latin", "Bengali")
        max_new_tokens: Maximum tokens to generate
        device: Device the model is on
    
    Returns:
        Transliterated text
    """
    lang_token = tokenizer.get_lang_token(target_language)
    prompt = (
        f"{tokenizer.bos_token}"
        f"{lang_token}"
        f"{text}"
        f"{tokenizer.sep_token}"
    )
    
    input_ids = tokenizer.encode_with_special(prompt)
    input_tensor = torch.tensor([input_ids], dtype=torch.long).to(device)
    
    with torch.no_grad():
        output_ids = model.generate(
            input_tensor,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            pad_token_id=tokenizer.pad_id,
            eos_token_id=tokenizer.eos_id,
            bos_token_id=tokenizer.bos_id,
        )
    
    generated_ids = output_ids[0, len(input_ids):].tolist()
    
    if tokenizer.eos_id in generated_ids:
        eos_pos = generated_ids.index(tokenizer.eos_id)
        generated_ids = generated_ids[:eos_pos]
    
    result = tokenizer.decode(generated_ids)
    return result


def main():
    parser = argparse.ArgumentParser(description="Transliterate text using the trained model")
    parser.add_argument("--text", "-t", type=str, help="Text to transliterate")
    parser.add_argument("--target", "-l", type=str, default="Latin", help="Target language")
    parser.add_argument("--model-id", type=str, help="Model ID or path (default: auto-detect)")
    parser.add_argument("--interactive", "-i", action="store_true", help="Interactive mode")
    args = parser.parse_args()
    
    # Default model ID
    if args.model_id is None:
        # Try local first, then you can set your HF model ID
        local_path = Path(__file__).parent.parent / "checkpoints" / "gemma3_char_translit" / "final"
        if local_path.exists():
            model_id = str(local_path)
        else:
            model_id = "psidharth567/indic-xlit-270M"  
    else:
        model_id = args.model_id
    
    # Load model
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model, tokenizer = load_model(model_id, device)
    
    if args.interactive:
        print("=" * 60)
        print("Transliteration Interactive Mode")
        print("=" * 60)
        print("Commands:")
        print("  Type text to transliterate")
        print("  Use '@lang' to set target language (e.g., @Hindi, @Latin)")
        print("  Type 'quit' or 'exit' to stop")
        print("=" * 60)
        
        target_lang = "Latin"
        print(f"Current target: {target_lang}\n")
        
        while True:
            try:
                user_input = input(">>> ").strip()
            except (EOFError, KeyboardInterrupt):
                print("\nGoodbye!")
                break
            
            if not user_input:
                continue
            
            if user_input.lower() in ("quit", "exit", "q"):
                print("Goodbye!")
                break
            
            if user_input.startswith("@"):
                target_lang = user_input[1:].strip().title()
                print(f"Target set to: {target_lang}")
                continue
            
            result = transliterate(model, tokenizer, user_input, target_lang, device=device)
            print(f"    {result}\n")
    
    elif args.text:
        result = transliterate(model, tokenizer, args.text, args.target, device=device)
        print(f"Input:  {args.text}")
        print(f"Target: {args.target}")
        print(f"Output: {result}")
    else:
        # Demo mode
        print("Demo transliterations:\n")
        demos = [
            ("नमस्ते", "Latin"),
            ("namaste", "Hindi"),
            ("धन्यवाद", "Latin"),
            ("dhanyavaad", "Hindi"),
        ]
        for text, target in demos:
            result = transliterate(model, tokenizer, text, target, device=device)
            print(f"  [{target:8}] {text} -> {result}")


if __name__ == "__main__":
    main()

Run it from the command line:

python inference_simple.py --text "नमस्ते" --target Latin
python inference_simple.py --text "namaste" --target Hindi
python inference_simple.py --interactive
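
The script's helpers can also be imported for batch use; a minimal sketch, assuming inference_simple.py is in the working directory:

# Batch transliteration via the functions defined in inference_simple.py.
import torch
from inference_simple import load_model, transliterate

device = "cuda" if torch.cuda.is_available() else "cpu"
model, tokenizer = load_model("psidharth567/indic-xlit-270M", device)
for word in ["namaste", "dhanyavaad"]:
    print(word, "->", transliterate(model, tokenizer, word, "Hindi", device=device))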

Model Architecture

  • Character-level tokenizer: Each Unicode character is a token
  • Custom vocabulary: Built from transliteration dataset
  • Separate embeddings and LM head: No weight tying
  • Special tokens: [BOS], [EOS], [SEP], [PAD], [UNK], and language tokens like [HINDI], [BENGALI], etc.
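
Because the tokenizer is character-level, sequence length tracks character count. A minimal round-trip sketch, assuming encode_with_special accepts plain text as it does the prompts above:

# Character-level round trip: roughly one token id per Unicode character.
tok = CharTokenizer.load("path/to/char_tokenizer")
ids = tok.encode_with_special("नमस्ते")
print(len("नमस्ते"), len(ids))  # id count ~ character count (plus any specials added)
print(tok.decode(ids))          # recovers the original string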