Replace two-letter language codes with three-letter language codes

This commit is contained in:
Sara
2023-10-13 23:36:02 +02:00
parent 47fc52af11
commit 90c6824f52
12 changed files with 91 additions and 140 deletions

View File

@@ -178,7 +178,7 @@ def web():
@app.post("/transcribe", dependencies=[Depends(apikey_auth)])
async def transcribe(
file: UploadFile,
source_language: Annotated[str, Body(...)] = "en",
source_language: Annotated[str, Body(...)] = "eng",
timestamp: Annotated[float, Body()] = 0.0
) -> TranscriptResponse:
audio_data = await file.read()

View File

@@ -219,8 +219,8 @@ def web():
@app.post("/translate", dependencies=[Depends(apikey_auth)])
async def translate(
text: str,
source_language: Annotated[str, Body(...)] = "en",
target_language: Annotated[str, Body(...)] = "fr",
source_language: Annotated[str, Body(...)] = "eng",
target_language: Annotated[str, Body(...)] = "fra",
) -> TranslateResponse:
func = translatorstub.translate_text.spawn(
text=text,

View File

@@ -8,20 +8,21 @@ API will be a POST request to TRANSCRIPT_URL:
"audio_url": "https://...",
"audio_ext": "wav",
"timestamp": 123.456,
"language": "en"
"language": "eng"
}
```
"""
from pathlib import Path
import httpx
from reflector.processors.audio_transcript import AudioTranscriptProcessor
from reflector.processors.audio_transcript_auto import AudioTranscriptAutoProcessor
from reflector.processors.types import AudioFile, Transcript, Word
from reflector.settings import settings
from reflector.storage import Storage
from reflector.utils.retry import retry
from pathlib import Path
import httpx
class AudioTranscriptBananaProcessor(AudioTranscriptProcessor):

View File

@@ -5,8 +5,8 @@ API will be a POST request to TRANSCRIPT_URL:
```form
"timestamp": 123.456
"source_language": "en"
"target_language": "en"
"source_language": "eng"
"target_language": "eng"
"file": <audio file>
```
@@ -33,7 +33,7 @@ class AudioTranscriptModalProcessor(AudioTranscriptProcessor):
files = {
"file": (data.name, data.fd),
}
source_language = self.get_pref("audio:source_language", "en")
source_language = self.get_pref("audio:source_language", "eng")
json_payload = {"source_language": source_language}
response = await retry(client.post)(
self.transcript_url,

View File

@@ -1,7 +1,7 @@
from faster_whisper import WhisperModel
from reflector.processors.audio_transcript import AudioTranscriptProcessor
from reflector.processors.audio_transcript_auto import AudioTranscriptAutoProcessor
from reflector.processors.types import AudioFile, Transcript, Word
from faster_whisper import WhisperModel
class AudioTranscriptWhisperProcessor(AudioTranscriptProcessor):
@@ -14,7 +14,7 @@ class AudioTranscriptWhisperProcessor(AudioTranscriptProcessor):
async def _transcript(self, data: AudioFile):
segments, _ = self.model.transcribe(
data.path.as_posix(),
language="en",
language="eng",
beam_size=5,
# condition_on_previous_text=True,
word_timestamps=True,

View File

@@ -28,8 +28,8 @@ class TranscriptTranslatorProcessor(Processor):
# FIXME: this should be a separate processor that runs afterwards, as each
# user may want different languages
source_language = self.get_pref("audio:source_language", "en")
target_language = self.get_pref("audio:target_language", "en")
source_language = self.get_pref("audio:source_language", "eng")
target_language = self.get_pref("audio:target_language", "eng")
if source_language == target_language:
return

View File

@@ -117,113 +117,64 @@ class FinalTitle(BaseModel):
title: str
# https://github.com/facebookresearch/seamless_communication/tree/main/scripts/m4t/predict#supported-languages
class TranslationLanguages(BaseModel):
language_to_id_mapping: dict = {
"Afrikaans": "af",
"Albanian": "sq",
"Amharic": "am",
"Arabic": "ar",
"Armenian": "hy",
"Asturian": "ast",
"Azerbaijani": "az",
"Bashkir": "ba",
"Belarusian": "be",
"Bengali": "bn",
"Bosnian": "bs",
"Breton": "br",
"Bulgarian": "bg",
"Burmese": "my",
"Catalan; Valencian": "ca",
"Cebuano": "ceb",
"Central Khmer": "km",
"Chinese": "zh",
"Croatian": "hr",
"Czech": "cs",
"Danish": "da",
"Dutch; Flemish": "nl",
"English": "en",
"Estonian": "et",
"Finnish": "fi",
"French": "fr",
"Fulah": "ff",
"Gaelic; Scottish Gaelic": "gd",
"Galician": "gl",
"Ganda": "lg",
"Georgian": "ka",
"German": "de",
"Greeek": "el",
"Gujarati": "gu",
"Haitian; Haitian Creole": "ht",
"Hausa": "ha",
"Hebrew": "he",
"Hindi": "hi",
"Hungarian": "hu",
"Icelandic": "is",
"Igbo": "ig",
"Iloko": "ilo",
"Indonesian": "id",
"Irish": "ga",
"Italian": "it",
"Japanese": "ja",
"Javanese": "jv",
"Kannada": "kn",
"Kazakh": "kk",
"Korean": "ko",
"Lao": "lo",
"Latvian": "lv",
"Lingala": "ln",
"Lithuanian": "lt",
"Luxembourgish; Letzeburgesch": "lb",
"Macedonian": "mk",
"Malagasy": "mg",
"Malay": "ms",
"Malayalam": "ml",
"Marathi": "mr",
"Mongolian": "mn",
"Nepali": "ne",
"Northern Sotho": "ns",
"Norwegian": "no",
"Occitan": "oc",
"Oriya": "or",
"Panjabi; Punjabi": "pa",
"Persian": "fa",
"Polish": "pl",
"Portuguese": "pt",
"Pushto; Pashto": "ps",
"Romanian; Moldavian; Moldovan": "ro",
"Russian": "ru",
"Serbian": "sr",
"Sindhi": "sd",
"Sinhala; Sinhalese": "si",
"Slovak": "sk",
"Slovenian": "sl",
"Somali": "so",
"Spanish": "es",
"Sundanese": "su",
"Swahili": "sw",
"Swati": "ss",
"Swedish": "sv",
"Tagalog": "tl",
"Tamil": "ta",
"Thai": "th",
"Tswana": "tn",
"Turkish": "tr",
"Ukrainian": "uk",
"Urdu": "ur",
"Uzbek": "uz",
"Vietnamese": "vi",
"Welsh": "cy",
"Western Frisian": "fy",
"Wolof": "wo",
"Xhosa": "xh",
"Yiddish": "yi",
"Yoruba": "yo",
"Zulu": "zu",
"afr": "Afrikaans",
"azj": "North Azerbaijani",
"bos": "Bosnian",
"cat": "Catalan",
"ceb": "Cebuano",
"ces": "Czech",
"cym": "Welsh",
"dan": "Danish",
"deu": "German",
"eng": "English",
"est": "Estonian",
"eus": "Basque",
"fin": "Finnish",
"fra": "French",
"gaz": "West Central Oromo",
"gle": "Irish",
"glg": "Galician",
"hrv": "Croatian",
"hun": "Hungarian",
"ibo": "Igbo",
"ind": "Indonesian",
"isl": "Icelandic",
"ita": "Italian",
"jav": "Javanese",
"lit": "Lithuanian",
"lug": "Ganda",
"luo": "Luo",
"lvs": "Standard Latvian",
"mlt": "Maltese",
"nld": "Dutch",
"nno": "Norwegian Nynorsk",
"nob": "Norwegian Bokmål",
"nya": "Nyanja",
"pol": "Polish",
"por": "Portuguese",
"ron": "Romanian",
"slk": "Slovak",
"slv": "Slovenian",
"sna": "Shona",
"som": "Somali",
"spa": "Spanish",
"swe": "Swedish",
"swh": "Swahili",
"tgl": "Tagalog",
"tur": "Turkish",
"uzn": "Northern Uzbek",
"vie": "Vietnamese",
"yor": "Yoruba",
"zsm": "Standard Malay",
"zul": "Zulu",
}
@property
def supported_languages(self):
return self.language_to_id_mapping.values()
return self.language_to_id_mapping.keys()
def is_supported(self, lang_id: str) -> bool:
if lang_id in self.supported_languages:

View File

@@ -1,7 +1,6 @@
import asyncio
import av
from reflector.logger import logger
from reflector.processors import (
AudioChunkerProcessor,
@@ -23,8 +22,8 @@ async def process_audio_file(
filename,
event_callback,
only_transcript=False,
source_language="en",
target_language="en",
source_language="eng",
target_language="eng",
):
# build pipeline for audio processing
processors = [
@@ -73,8 +72,8 @@ if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("source", help="Source file (mp3, wav, mp4...)")
parser.add_argument("--only-transcript", "-t", action="store_true")
parser.add_argument("--source-language", default="en")
parser.add_argument("--target-language", default="en")
parser.add_argument("--source-language", default="eng")
parser.add_argument("--target-language", default="eng")
parser.add_argument("--output", "-o", help="Output file (output.jsonl)")
args = parser.parse_args()

View File

@@ -90,8 +90,8 @@ async def rtc_offer_base(
event_callback=None,
event_callback_args=None,
audio_filename: Path | None = None,
source_language: str = "en",
target_language: str = "en",
source_language: str = "eng",
target_language: str = "eng",
):
# build an rtc session
offer = RTCSessionDescription(sdp=params.sdp, type=params.type)

View File

@@ -87,8 +87,8 @@ class Transcript(BaseModel):
long_summary: str | None = None
topics: list[TranscriptTopic] = []
events: list[TranscriptEvent] = []
source_language: str = "en"
target_language: str = "en"
source_language: str = "eng"
target_language: str = "eng"
def add_event(self, event: str, data: BaseModel) -> TranscriptEvent:
ev = TranscriptEvent(event=event, data=data.model_dump())
@@ -170,8 +170,8 @@ class TranscriptController:
async def add(
self,
name: str,
source_language: str = "en",
target_language: str = "en",
source_language: str = "eng",
target_language: str = "eng",
user_id: str | None = None,
):
transcript = Transcript(
@@ -231,8 +231,8 @@ class GetTranscript(BaseModel):
class CreateTranscript(BaseModel):
name: str
source_language: str = Field("en")
target_language: str = Field("en")
source_language: str = Field("eng")
target_language: str = Field("eng")
class UpdateTranscript(BaseModel):

View File

@@ -46,7 +46,7 @@ async def dummy_transcript():
class TestAudioTranscriptProcessor(AudioTranscriptProcessor):
async def _transcript(self, data: AudioFile):
source_language = self.get_pref("audio:source_language", "en")
source_language = self.get_pref("audio:source_language", "eng")
print("transcripting", source_language)
print("pipeline", self.pipeline)
print("prefs", self.pipeline.prefs)

View File

@@ -10,15 +10,15 @@ async def test_transcript_create_default_translation():
response = await ac.post("/transcripts", json={"name": "test en"})
assert response.status_code == 200
assert response.json()["name"] == "test en"
assert response.json()["source_language"] == "en"
assert response.json()["target_language"] == "en"
assert response.json()["source_language"] == "eng"
assert response.json()["target_language"] == "eng"
tid = response.json()["id"]
response = await ac.get(f"/transcripts/{tid}")
assert response.status_code == 200
assert response.json()["name"] == "test en"
assert response.json()["source_language"] == "en"
assert response.json()["target_language"] == "en"
assert response.json()["source_language"] == "eng"
assert response.json()["target_language"] == "eng"
@pytest.mark.asyncio
@@ -31,15 +31,15 @@ async def test_transcript_create_en_fr_translation():
)
assert response.status_code == 200
assert response.json()["name"] == "test en/fr"
assert response.json()["source_language"] == "en"
assert response.json()["target_language"] == "fr"
assert response.json()["source_language"] == "eng"
assert response.json()["target_language"] == "fra"
tid = response.json()["id"]
response = await ac.get(f"/transcripts/{tid}")
assert response.status_code == 200
assert response.json()["name"] == "test en/fr"
assert response.json()["source_language"] == "en"
assert response.json()["target_language"] == "fr"
assert response.json()["source_language"] == "eng"
assert response.json()["target_language"] == "fra"
@pytest.mark.asyncio
@@ -52,12 +52,12 @@ async def test_transcript_create_fr_en_translation():
)
assert response.status_code == 200
assert response.json()["name"] == "test fr/en"
assert response.json()["source_language"] == "fr"
assert response.json()["target_language"] == "en"
assert response.json()["source_language"] == "fra"
assert response.json()["target_language"] == "eng"
tid = response.json()["id"]
response = await ac.get(f"/transcripts/{tid}")
assert response.status_code == 200
assert response.json()["name"] == "test fr/en"
assert response.json()["source_language"] == "fr"
assert response.json()["target_language"] == "en"
assert response.json()["source_language"] == "fra"
assert response.json()["target_language"] == "eng"