# Indic Transliteration Model
A fine-tuned Gemma-3-270M model for transliteration between Indic scripts and Latin/Romanized text.
## Model Details
- Base Model: google/gemma-3-270m
- Architecture: Gemma-3-270M with custom character-level tokenizer
- Vocabulary Size: ~1,851 tokens (character-level)
- Training: Fine-tuned on 100k transliteration pairs
- Format: `[BOS][LANG]{source}[SEP]{target}[EOS]`
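
For illustration, a single training pair under this format would be serialized as in the sketch below. The literal token spellings here are assumptions; the real strings come from the bundled `CharTokenizer`.

```python
# Hypothetical sketch of the [BOS][LANG]{source}[SEP]{target}[EOS] layout.
# Token spellings are assumptions; the real strings are defined by the
# model's CharTokenizer, and [LANG] names the *target* script.
bos, sep, eos = "[BOS]", "[SEP]", "[EOS]"
lang = "[LATIN]"  # target-script token, e.g. [HINDI], [BENGALI], ...
source, target = "नमस्ते", "namaste"

pair = f"{bos}{lang}{source}{sep}{target}{eos}"
print(pair)  # [BOS][LATIN]नमस्ते[SEP]namaste[EOS]
```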
## Supported Languages
The model supports transliteration between the following languages and scripts:
- Assamese
- Bengali
- Bodo
- Dogri
- Gujarati
- Hindi
- Kannada
- Kashmiri
- Konkani
- Maithili
- Malayalam
- Manipuri
- Marathi
- Nepali
- Odia
- Punjabi
- Sanskrit
- Santali
- Sindhi
- Tamil
- Telugu
- Urdu
- Latin (Romanized output)
## Usage

### Installation

```bash
pip install torch transformers huggingface_hub
```
### Quick Start

```python
import torch
from transformers import AutoModelForCausalLM

from char_tokenizer import CharTokenizer
# Load model (custom class handles embedding fix automatically)
model = AutoModelForCausalLM.from_pretrained(
    "psidharth567/indic-xlit-270M",
    trust_remote_code=True,  # required for the custom model class
    torch_dtype=torch.bfloat16,
)
# Load tokenizer
tokenizer = CharTokenizer.load("path/to/char_tokenizer")
# Transliterate
lang_token = tokenizer.get_lang_token("Latin")
prompt = f"{tokenizer.bos_token}{lang_token}नमस्ते{tokenizer.sep_token}"
input_ids = tokenizer.encode_with_special(prompt)
# ... generate and decode
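# A sketch of the elided step, mirroring the full inference_simple.py below
# (pad_id / eos_id are the token-ID attributes exposed by CharTokenizer):
input_tensor = torch.tensor([input_ids], dtype=torch.long)
with torch.no_grad():
    output_ids = model.generate(
        input_tensor,
        max_new_tokens=256,
        do_sample=False,
        pad_token_id=tokenizer.pad_id,
        eos_token_id=tokenizer.eos_id,
    )
print(tokenizer.decode(output_ids[0, len(input_ids):].tolist()))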
```

### Simple Script

Use the provided `inference_simple.py` script:

```python
"""
Simple inference script for Indic Transliteration Model.
Usage:
python inference_simple.py --text "namaste" --target Hindi
python inference_simple.py --text "नमस्ते" --target Latin
python inference_simple.py --interactive
"""
from __future__ import annotations
import argparse
import sys
from pathlib import Path
# Force UTF-8 console on Windows
if hasattr(sys.stdout, "reconfigure"):
sys.stdout.reconfigure(encoding="utf-8")
if hasattr(sys.stderr, "reconfigure"):
sys.stderr.reconfigure(encoding="utf-8")
import torch
import torch.nn as nn
from transformers import AutoModelForCausalLM, AutoTokenizer
# Import char_tokenizer
try:
from char_tokenizer import CharTokenizer
except ImportError:
sys.path.insert(0, str(Path(__file__).parent))
from char_tokenizer import CharTokenizer
def load_model(model_id: str, device: str | None = None):
"""
Load the transliteration model.
Args:
model_id: Hugging Face model ID or local path
device: Device to use ('cuda' or 'cpu'). If None, auto-detects.
Returns:
Loaded model and tokenizer
"""
if device is None:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Loading model: {model_id}")
print(f"Device: {device}")
# Load tokenizer
if Path(model_id).exists():
tokenizer_path = Path(model_id) / "char_tokenizer"
if not tokenizer_path.exists():
tokenizer_path = Path(model_id)
else:
# Hugging Face - download tokenizer
from huggingface_hub import snapshot_download
cache_dir = snapshot_download(repo_id=model_id, allow_patterns="char_tokenizer/*")
tokenizer_path = Path(cache_dir) / "char_tokenizer"
tokenizer = CharTokenizer.load(tokenizer_path)
print(f" Tokenizer vocab size: {tokenizer.vocab_size}")
# Load config first to disable weight tying
from transformers import AutoConfig
config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
if config.tie_word_embeddings:
config.tie_word_embeddings = False
# Load model with trust_remote_code=True to use custom model class
model = AutoModelForCausalLM.from_pretrained(
model_id,
config=config,
trust_remote_code=True, # This loads the custom model class
dtype=torch.bfloat16 if device == "cuda" else torch.float32,
attn_implementation="sdpa",
)
model = model.to(device)
model.eval()
# CRITICAL: Fix embedding wrapper - REQUIRED for Gemma models
# Even with custom model class, ensure the fix is applied
if hasattr(model.model, "embed_tokens"):
old_embed = model.model.embed_tokens
if "ScaledWordEmbedding" in str(type(old_embed)):
print(" Fixing embedding wrapper...")
embed_device = next(old_embed.parameters()).device
embed_dtype = next(old_embed.parameters()).dtype
new_embed = nn.Embedding(
old_embed.num_embeddings,
old_embed.embedding_dim,
padding_idx=getattr(old_embed, "padding_idx", None),
device=embed_device,
dtype=embed_dtype,
)
new_embed.weight.data = old_embed.weight.data.clone()
model.model.embed_tokens = new_embed
print(" ✓ Fixed embedding wrapper")
else:
print(" Embedding is already correct")
# Fix token IDs in config
model.config.pad_token_id = tokenizer.pad_id
model.config.eos_token_id = tokenizer.eos_id
model.config.bos_token_id = tokenizer.bos_id
print("Ready!")
return model, tokenizer
def transliterate(
model,
tokenizer: CharTokenizer,
text: str,
target_language: str = "Latin",
max_new_tokens: int = 256,
device: str = "cuda",
) -> str:
"""
Transliterate text to the target language/script.
Args:
model: Loaded model
tokenizer: CharTokenizer instance
text: Source text to transliterate
target_language: Target language (e.g., "Hindi", "Latin", "Bengali")
max_new_tokens: Maximum tokens to generate
device: Device the model is on
Returns:
Transliterated text
"""
lang_token = tokenizer.get_lang_token(target_language)
prompt = (
f"{tokenizer.bos_token}"
f"{lang_token}"
f"{text}"
f"{tokenizer.sep_token}"
)
input_ids = tokenizer.encode_with_special(prompt)
input_tensor = torch.tensor([input_ids], dtype=torch.long).to(device)
with torch.no_grad():
output_ids = model.generate(
input_tensor,
max_new_tokens=max_new_tokens,
do_sample=False,
pad_token_id=tokenizer.pad_id,
eos_token_id=tokenizer.eos_id,
bos_token_id=tokenizer.bos_id,
)
generated_ids = output_ids[0, len(input_ids):].tolist()
if tokenizer.eos_id in generated_ids:
eos_pos = generated_ids.index(tokenizer.eos_id)
generated_ids = generated_ids[:eos_pos]
result = tokenizer.decode(generated_ids)
return result
def main():
parser = argparse.ArgumentParser(description="Transliterate text using the trained model")
parser.add_argument("--text", "-t", type=str, help="Text to transliterate")
parser.add_argument("--target", "-l", type=str, default="Latin", help="Target language")
parser.add_argument("--model-id", type=str, help="Model ID or path (default: auto-detect)")
parser.add_argument("--interactive", "-i", action="store_true", help="Interactive mode")
args = parser.parse_args()
# Default model ID
if args.model_id is None:
# Try local first, then you can set your HF model ID
local_path = Path(__file__).parent.parent / "checkpoints" / "gemma3_char_translit" / "final"
if local_path.exists():
model_id = str(local_path)
else:
model_id = "psidharth567/indic-xlit-270M"
else:
model_id = args.model_id
# Load model
device = "cuda" if torch.cuda.is_available() else "cpu"
model, tokenizer = load_model(model_id, device)
if args.interactive:
print("=" * 60)
print("Transliteration Interactive Mode")
print("=" * 60)
print("Commands:")
print(" Type text to transliterate")
print(" Use '@lang' to set target language (e.g., @Hindi, @Latin)")
print(" Type 'quit' or 'exit' to stop")
print("=" * 60)
target_lang = "Latin"
print(f"Current target: {target_lang}\n")
while True:
try:
user_input = input(">>> ").strip()
except (EOFError, KeyboardInterrupt):
print("\nGoodbye!")
break
if not user_input:
continue
if user_input.lower() in ("quit", "exit", "q"):
print("Goodbye!")
break
if user_input.startswith("@"):
target_lang = user_input[1:].strip().title()
print(f"Target set to: {target_lang}")
continue
result = transliterate(model, tokenizer, user_input, target_lang, device=device)
print(f" {result}\n")
elif args.text:
result = transliterate(model, tokenizer, args.text, args.target, device=device)
print(f"Input: {args.text}")
print(f"Target: {args.target}")
print(f"Output: {result}")
else:
# Demo mode
print("Demo transliterations:\n")
demos = [
("नमस्ते", "Latin"),
("namaste", "Hindi"),
("धन्यवाद", "Latin"),
("dhanyavaad", "Hindi"),
]
for text, target in demos:
result = transliterate(model, tokenizer, text, target, device=device)
print(f" [{target:8}] {text} -> {result}")
if __name__ == "__main__":
main()
```

Run it directly:

```bash
python inference_simple.py --text "नमस्ते" --target Latin
python inference_simple.py --text "namaste" --target Hindi
python inference_simple.py --interactive
```
## Model Architecture
- Character-level tokenizer: Each Unicode character is a token
- Custom vocabulary: Built from transliteration dataset
- Separate embeddings and LM head: No weight tying
- Special tokens: `[BOS]`, `[EOS]`, `[SEP]`, `[PAD]`, `[UNK]`, plus language tokens such as `[HINDI]`, `[BENGALI]`, etc.
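
Because tokenization is per character, encoding reduces to a per-character vocabulary lookup. The sketch below illustrates the idea only; the shipped `CharTokenizer` additionally reserves IDs for the special and language tokens listed above and uses `[UNK]` for characters outside its vocabulary.

```python
# Toy character-level tokenizer - illustrative only, not the shipped CharTokenizer.
text = "नमस्ते"
vocab = {ch: i for i, ch in enumerate(sorted(set(text)))}  # one ID per character
inv_vocab = {i: ch for ch, i in vocab.items()}

ids = [vocab[ch] for ch in text]              # encode: character -> ID
decoded = "".join(inv_vocab[i] for i in ids)  # decode: ID -> character
assert decoded == text
```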