update language codes

This commit is contained in:
Gokul Mohanarangan
2023-10-14 17:35:30 +05:30
15 changed files with 1107 additions and 427 deletions

View File

@@ -167,194 +167,196 @@ class Translator:
"""
# TODO: Enhance with complete list of lang codes
seamless_lang_code = {
# Afrikaans
'af': 'afr',
# Amharic
"am": "amh",
'am': 'amh',
# Modern Standard Arabic
"ar": "arb",
'ar': 'arb',
# Moroccan Arabic
# (No 2-letter code)
'ary': 'ary',
# Egyptian Arabic
# (No 2-letter code)
'arz': 'arz',
# Assamese
"as": "asm",
'as': 'asm',
# North Azerbaijani
"az": "azj",
'az': 'azj',
# Belarusian
"be": "bel",
'be': 'bel',
# Bengali
"bn": "ben",
'bn': 'ben',
# Bosnian
"bs": "bos",
'bs': 'bos',
# Bulgarian
"bg": "bul",
'bg': 'bul',
# Catalan
"ca": "cat",
'ca': 'cat',
# Cebuano
"ceb": "ceb",
'ceb': 'ceb',
# Czech
"cs": "ces",
'cs': 'ces',
# Central Kurdish
"ckb": "ckb",
# Mandarin Chinese (Simplified)
"zh": "cmn",
# Mandarin Chinese (Traditional)
# (No separate 2-letter code)
'ku': 'ckb',
# Mandarin Chinese
'cmn': 'cmn_Hant',
# Welsh
"cy": "cym",
'cy': 'cym',
# Danish
"da": "dan",
'da': 'dan',
# German
"de": "deu",
'de': 'deu',
# Greek
"el": "ell",
'el': 'ell',
# English
"en": "eng",
'en': 'eng',
# Estonian
"et": "est",
'et': 'est',
# Basque
"eu": "eus",
'eu': 'eus',
# Finnish
"fi": "fin",
'fi': 'fin',
# French
"fr": "fra",
# West Central Oromo
# (No 2-letter code)
'fr': 'fra',
# Irish
"ga": "gle",
'ga': 'gle',
# West Central Oromo
'gaz': 'gaz',
# Galician
"gl": "glg",
'gl': 'glg',
# Gujarati
"gu": "guj",
'gu': 'guj',
# Hebrew
"he": "heb",
'he': 'heb',
# Hindi
"hi": "hin",
'hi': 'hin',
# Croatian
"hr": "hrv",
'hr': 'hrv',
# Hungarian
"hu": "hun",
'hu': 'hun',
# Armenian
"hy": "hye",
'hy': 'hye',
# Igbo
"ig": "ibo",
'ig': 'ibo',
# Indonesian
"id": "ind",
'id': 'ind',
# Icelandic
"is": "isl",
'is': 'isl',
# Italian
"it": "ita",
'it': 'ita',
# Javanese
"jv": "jav",
'jv': 'jav',
# Japanese
"ja": "jpn",
'ja': 'jpn',
# Kannada
"kn": "kan",
'kn': 'kan',
# Georgian
"ka": "kat",
'ka': 'kat',
# Kazakh
"kk": "kaz",
'kk': 'kaz',
# Halh Mongolian
# (No 2-letter code)
'khk': 'khk',
# Khmer
"km": "khm",
'km': 'khm',
# Kyrgyz
"ky": "kir",
'ky': 'kir',
# Korean
"ko": "kor",
'ko': 'kor',
# Lao
"lo": "lao",
'lo': 'lao',
# Lithuanian
"lt": "lit",
'lt': 'lit',
# Ganda
"lg": "lug",
'lg': 'lug',
# Luo
"luo": "luo",
'luo': 'luo',
# Standard Latvian
"lv": "lvs",
'lv': 'lvs',
# Maithili
# (No 2-letter code)
'mai': 'mai',
# Malayalam
"ml": "mal",
'ml': 'mal',
# Marathi
"mr": "mar",
'mr': 'mar',
# Macedonian
"mk": "mkd",
'mk': 'mkd',
# Maltese
"mt": "mlt",
'mt': 'mlt',
# Meitei
# (No 2-letter code)
'mni': 'mni',
# Burmese
"my": "mya",
'my': 'mya',
# Dutch
"nl": "nld",
'nl': 'nld',
# Norwegian Nynorsk
"nn": "nno",
'nn': 'nno',
# Norwegian Bokmål
"nb": "nob",
'nb': 'nob',
# Nepali
"ne": "npi",
'ne': 'npi',
# Nyanja
"ny": "nya",
'ny': 'nya',
# Odia
"or": "ory",
'or': 'ory',
# Punjabi
"pa": "pan",
'pa': 'pan',
# Southern Pashto
# (No 2-letter code)
'pbt': 'pbt',
# Western Persian
"fa": "pes",
'pes': 'pes',
# Polish
"pl": "pol",
'pl': 'pol',
# Portuguese
"pt": "por",
'pt': 'por',
# Romanian
"ro": "ron",
'ro': 'ron',
# Russian
"ru": "rus",
'ru': 'rus',
# Slovak
"sk": "slk",
'sk': 'slk',
# Slovenian
"sl": "slv",
'sl': 'slv',
# Shona
"sn": "sna",
'sn': 'sna',
# Sindhi
"sd": "snd",
'sd': 'snd',
# Somali
"so": "som",
'so': 'som',
# Spanish
"es": "spa",
'es': 'spa',
# Serbian
"sr": "srp",
'sr': 'srp',
# Swedish
"sv": "swe",
'sv': 'swe',
# Swahili
"sw": "swh",
'sw': 'swh',
# Tamil
"ta": "tam",
'ta': 'tam',
# Telugu
"te": "tel",
'te': 'tel',
# Tajik
"tg": "tgk",
'tg': 'tgk',
# Tagalog
"tl": "tgl",
'tl': 'tgl',
# Thai
"th": "tha",
'th': 'tha',
# Turkish
"tr": "tur",
'tr': 'tur',
# Ukrainian
"uk": "ukr",
'uk': 'ukr',
# Urdu
"ur": "urd",
'ur': 'urd',
# Northern Uzbek
"uz": "uzn",
'uz': 'uzn',
# Vietnamese
"vi": "vie",
'vi': 'vie',
# Yoruba
"yo": "yor",
'yo': 'yor',
# Cantonese
# (No separate 2-letter code)
'yue': 'yue',
# Standard Malay
'ms': 'zsm',
# Zulu
"zu": "zul",
'zu': 'zul'
}
return seamless_lang_code.get(lang_code, "eng")
@@ -408,10 +410,10 @@ def web():
result: dict
@app.post("/translate", dependencies=[Depends(apikey_auth)])
def translate(
text: str,
source_language: Annotated[str, Body(...)] = "en",
target_language: Annotated[str, Body(...)] = "fr",
async def translate(
text: str,
source_language: Annotated[str, Body(...)] = "en",
target_language: Annotated[str, Body(...)] = "fr",
) -> TranslateResponse:
func = translatorstub.translate_text.spawn(
text=text,

View File

@@ -14,14 +14,15 @@ API will be a POST request to TRANSCRIPT_URL:
"""
from pathlib import Path
import httpx
from reflector.processors.audio_transcript import AudioTranscriptProcessor
from reflector.processors.audio_transcript_auto import AudioTranscriptAutoProcessor
from reflector.processors.types import AudioFile, Transcript, Word
from reflector.settings import settings
from reflector.storage import Storage
from reflector.utils.retry import retry
from pathlib import Path
import httpx
class AudioTranscriptBananaProcessor(AudioTranscriptProcessor):

View File

@@ -1,7 +1,7 @@
from faster_whisper import WhisperModel
from reflector.processors.audio_transcript import AudioTranscriptProcessor
from reflector.processors.audio_transcript_auto import AudioTranscriptAutoProcessor
from reflector.processors.types import AudioFile, Transcript, Word
from faster_whisper import WhisperModel
class AudioTranscriptWhisperProcessor(AudioTranscriptProcessor):

View File

@@ -117,113 +117,204 @@ class FinalTitle(BaseModel):
title: str
# https://github.com/facebookresearch/seamless_communication/tree/main/scripts/m4t/predict#supported-languages
class TranslationLanguages(BaseModel):
language_to_id_mapping: dict = {
"Afrikaans": "af",
"Albanian": "sq",
"Amharic": "am",
"Arabic": "ar",
"Armenian": "hy",
"Asturian": "ast",
"Azerbaijani": "az",
"Bashkir": "ba",
"Belarusian": "be",
"Bengali": "bn",
"Bosnian": "bs",
"Breton": "br",
"Bulgarian": "bg",
"Burmese": "my",
"Catalan; Valencian": "ca",
"Cebuano": "ceb",
"Central Khmer": "km",
"Chinese": "zh",
"Croatian": "hr",
"Czech": "cs",
"Danish": "da",
"Dutch; Flemish": "nl",
"English": "en",
"Estonian": "et",
"Finnish": "fi",
"French": "fr",
"Fulah": "ff",
"Gaelic; Scottish Gaelic": "gd",
"Galician": "gl",
"Ganda": "lg",
"Georgian": "ka",
"German": "de",
"Greeek": "el",
"Gujarati": "gu",
"Haitian; Haitian Creole": "ht",
"Hausa": "ha",
"Hebrew": "he",
"Hindi": "hi",
"Hungarian": "hu",
"Icelandic": "is",
"Igbo": "ig",
"Iloko": "ilo",
"Indonesian": "id",
"Irish": "ga",
"Italian": "it",
"Japanese": "ja",
"Javanese": "jv",
"Kannada": "kn",
"Kazakh": "kk",
"Korean": "ko",
"Lao": "lo",
"Latvian": "lv",
"Lingala": "ln",
"Lithuanian": "lt",
"Luxembourgish; Letzeburgesch": "lb",
"Macedonian": "mk",
"Malagasy": "mg",
"Malay": "ms",
"Malayalam": "ml",
"Marathi": "mr",
"Mongolian": "mn",
"Nepali": "ne",
"Northern Sotho": "ns",
"Norwegian": "no",
"Occitan": "oc",
"Oriya": "or",
"Panjabi; Punjabi": "pa",
"Persian": "fa",
"Polish": "pl",
"Portuguese": "pt",
"Pushto; Pashto": "ps",
"Romanian; Moldavian; Moldovan": "ro",
"Russian": "ru",
"Serbian": "sr",
"Sindhi": "sd",
"Sinhala; Sinhalese": "si",
"Slovak": "sk",
"Slovenian": "sl",
"Somali": "so",
"Spanish": "es",
"Sundanese": "su",
"Swahili": "sw",
"Swati": "ss",
"Swedish": "sv",
"Tagalog": "tl",
"Tamil": "ta",
"Thai": "th",
"Tswana": "tn",
"Turkish": "tr",
"Ukrainian": "uk",
"Urdu": "ur",
"Uzbek": "uz",
"Vietnamese": "vi",
"Welsh": "cy",
"Western Frisian": "fy",
"Wolof": "wo",
"Xhosa": "xh",
"Yiddish": "yi",
"Yoruba": "yo",
"Zulu": "zu",
# Afrikaans
"af": "afr",
# Amharic
"am": "amh",
# Modern Standard Arabic
"ar": "arb",
# Moroccan Arabic
"ary": "ary",
# Egyptian Arabic
"arz": "arz",
# Assamese
"as": "asm",
# North Azerbaijani
"az": "azj",
# Belarusian
"be": "bel",
# Bengali
"bn": "ben",
# Bosnian
"bs": "bos",
# Bulgarian
"bg": "bul",
# Catalan
"ca": "cat",
# Cebuano
"ceb": "ceb",
# Czech
"cs": "ces",
# Central Kurdish
"ku": "ckb",
# Mandarin Chinese
"cmn": "cmn_Hant",
# Welsh
"cy": "cym",
# Danish
"da": "dan",
# German
"de": "deu",
# Greek
"el": "ell",
# English
"en": "eng",
# Estonian
"et": "est",
# Basque
"eu": "eus",
# Finnish
"fi": "fin",
# French
"fr": "fra",
# Irish
"ga": "gle",
# West Central Oromo
"gaz": "gaz",
# Galician
"gl": "glg",
# Gujarati
"gu": "guj",
# Hebrew
"he": "heb",
# Hindi
"hi": "hin",
# Croatian
"hr": "hrv",
# Hungarian
"hu": "hun",
# Armenian
"hy": "hye",
# Igbo
"ig": "ibo",
# Indonesian
"id": "ind",
# Icelandic
"is": "isl",
# Italian
"it": "ita",
# Javanese
"jv": "jav",
# Japanese
"ja": "jpn",
# Kannada
"kn": "kan",
# Georgian
"ka": "kat",
# Kazakh
"kk": "kaz",
# Halh Mongolian
"khk": "khk",
# Khmer
"km": "khm",
# Kyrgyz
"ky": "kir",
# Korean
"ko": "kor",
# Lao
"lo": "lao",
# Lithuanian
"lt": "lit",
# Ganda
"lg": "lug",
# Luo
"luo": "luo",
# Standard Latvian
"lv": "lvs",
# Maithili
"mai": "mai",
# Malayalam
"ml": "mal",
# Marathi
"mr": "mar",
# Macedonian
"mk": "mkd",
# Maltese
"mt": "mlt",
# Meitei
"mni": "mni",
# Burmese
"my": "mya",
# Dutch
"nl": "nld",
# Norwegian Nynorsk
"nn": "nno",
# Norwegian Bokmål
"nb": "nob",
# Nepali
"ne": "npi",
# Nyanja
"ny": "nya",
# Odia
"or": "ory",
# Punjabi
"pa": "pan",
# Southern Pashto
"pbt": "pbt",
# Western Persian
"pes": "pes",
# Polish
"pl": "pol",
# Portuguese
"pt": "por",
# Romanian
"ro": "ron",
# Russian
"ru": "rus",
# Slovak
"sk": "slk",
# Slovenian
"sl": "slv",
# Shona
"sn": "sna",
# Sindhi
"sd": "snd",
# Somali
"so": "som",
# Spanish
"es": "spa",
# Serbian
"sr": "srp",
# Swedish
"sv": "swe",
# Swahili
"sw": "swh",
# Tamil
"ta": "tam",
# Telugu
"te": "tel",
# Tajik
"tg": "tgk",
# Tagalog
"tl": "tgl",
# Thai
"th": "tha",
# Turkish
"tr": "tur",
# Ukrainian
"uk": "ukr",
# Urdu
"ur": "urd",
# Northern Uzbek
"uz": "uzn",
# Vietnamese
"vi": "vie",
# Yoruba
"yo": "yor",
# Cantonese
"yue": "yue",
# Standard Malay
"ms": "zsm",
# Zulu
"zu": "zul",
}
@property
def supported_languages(self):
return self.language_to_id_mapping.values()
return self.language_to_id_mapping.keys()
def is_supported(self, lang_id: str) -> bool:
if lang_id in self.supported_languages:

View File

@@ -1,7 +1,6 @@
import asyncio
import av
from reflector.logger import logger
from reflector.processors import (
AudioChunkerProcessor,