Mirror of https://github.com/Monadical-SAS/reflector.git (synced 2025-12-20 20:29:06 +00:00)
Merge branch 'main' of github.com:Monadical-SAS/reflector into sara/recorder-memory
server/.gitignore (vendored): 1 line changed
@@ -168,7 +168,6 @@ wordcloud*.png
utils/secrets.ini
test_samples/
# *.wav
*.mp3
*.m4a
.DS_Store/
.DS_Store
@@ -7,16 +7,14 @@ ENV PIP_DEFAULT_TIMEOUT=100 \
    PYTHONUNBUFFERED=1 \
    POETRY_VERSION=1.3.1

# install packages needed for base
# RUN apt-get update && apt-get install -y make gettext

# builder
# builder install base dependencies
FROM base AS builder
WORKDIR /tmp
# RUN apt-get install -y build-essential libffi-dev zlib1g-dev
COPY pyproject.toml poetry.lock /tmp
RUN pip install "poetry==$POETRY_VERSION"
RUN python -m venv /venv

# install application dependencies
COPY pyproject.toml poetry.lock /tmp
RUN . /venv/bin/activate && poetry config virtualenvs.create false
RUN . /venv/bin/activate && poetry install --only main,aws --no-root --no-interaction --no-ansi
server/poetry.lock (generated): 2040 lines changed
File diff suppressed because it is too large
@@ -109,6 +109,7 @@ class LLM:
        self.m_generate_call = self.m_generate_call.labels(name)
        self.m_generate_success = self.m_generate_success.labels(name)
        self.m_generate_failure = self.m_generate_failure.labels(name)
        self.detokenizer = nltk.tokenize.treebank.TreebankWordDetokenizer()

    @property
    def tokenizer(self):
@@ -193,15 +194,11 @@ class LLM:
                    camel_cased.append(word[0].upper() + word[1:])
                else:
                    camel_cased.append(word)
            modified_title = " ".join(camel_cased)
            modified_title = self.detokenizer.detokenize(camel_cased)

            # The result can have words in braces with additional space.
            # Change ( ABC ), [ ABC ], etc. ==> (ABC), [ABC], etc.
            pattern = r"(?<=[\[\{\(])\s+|\s+(?=[\]\}\)])"
            title = re.sub(pattern, "", modified_title)
            # Irrespective of casing changes, the starting letter
            # of title is always upper-cased
            title = title[0].upper() + title[1:]
            title = modified_title[0].upper() + modified_title[1:]
        except Exception as e:
            reflector_logger.info(
                f"Failed to ensure casing on {title=} " f"with exception : {str(e)}"
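The second hunk above replaces the plain " ".join(camel_cased) with the TreebankWordDetokenizer added to __init__ in the first hunk, and drops the regex that removed stray spaces inside ( ), [ ] and { }. A minimal sketch of the difference, assuming nltk is installed; the token list is invented for illustration:

from nltk.tokenize.treebank import TreebankWordDetokenizer

tokens = ["A", "Title", "(", "With", "Brackets", ")"]

# a plain join puts a space on both sides of every token, brackets included
print(" ".join(tokens))  # A Title ( With Brackets )

# the detokenizer applies its own spacing rules while rejoining tokens, which is
# presumably why the follow-up regex cleanup is no longer needed
print(TreebankWordDetokenizer().detokenize(tokens))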
server/reflector/tools/exportdanswer.py (new file): 70 lines
@@ -0,0 +1,70 @@
import json
import pathlib
from datetime import timedelta


async def export_db(filename: str) -> None:
    from reflector.settings import settings

    filename = pathlib.Path(filename).resolve()
    settings.DATABASE_URL = f"sqlite:///{filename}"

    from reflector.db import database, transcripts

    await database.connect()
    transcripts = await database.fetch_all(transcripts.select())
    await database.disconnect()

    def export_transcript(transcript, output_dir):
        for topic in transcript.topics:
            metadata = {
                "link": f"https://reflector.media/transcripts/{transcript.id}#topic:{topic['id']},timestamp:{topic['timestamp']}",
                "transcript_id": transcript.id,
                "transcript_created_at": transcript.created_at.isoformat(),
                "topic_id": topic["id"],
                "topic_relative_timestamp": topic["timestamp"],
                "topic_created_at": (
                    transcript.created_at + timedelta(seconds=topic["timestamp"])
                ).isoformat(),
                "topic_title": topic["title"],
            }
            j_metadata = json.dumps(metadata)

            # export transcript
            output = output_dir / f"{transcript.id}-topic-{topic['id']}.txt"
            with open(output, "w", encoding="utf8") as fd:
                fd.write(f"#DANSWER_METADATA={j_metadata}\n")
                fd.write("\n")
                fd.write(f"# {topic['title']}\n")
                fd.write("\n")
                fd.write(f"{topic['transcript']}\n")

            # # export summary
            # output = output_dir / f"{transcript.id}-summary.txt"
            # metadata = {
            #     "link": f"https://reflector.media/transcripts/{transcript.id}",
            #     "rfl_id": transcript.id,
            # }
            #
            # j_metadata = json.dumps(metadata)
            # with open(output, "w", encoding="utf8") as fd:
            #     fd.write(f"#DANSWER_METADATA={j_metadata}\n")
            #     fd.write("\n")
            #     fd.write("# Summary\n")
            #     fd.write("\n")
            #     fd.write(f"{transcript.long_summary}\n")

    output_dir = pathlib.Path("exportdanswer")
    for transcript in transcripts:
        export_transcript(transcript, output_dir)


if __name__ == "__main__":
    import argparse
    import asyncio

    parser = argparse.ArgumentParser()
    parser.add_argument("database", help="Sqlite Database file")
    args = parser.parse_args()

    asyncio.run(export_db(args.database))
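The exporter writes one plain-text file per topic, each starting with a #DANSWER_METADATA header line followed by the topic title and transcript; note that it assumes the exportdanswer/ output directory already exists, since it never creates it. A minimal sketch (not part of the commit) of reading one such file back; the file name is hypothetical:

import json
import pathlib

# hypothetical file name; real names follow f"{transcript.id}-topic-{topic['id']}.txt"
path = pathlib.Path("exportdanswer") / "abc123-topic-0.txt"
with open(path, encoding="utf8") as fd:
    header = fd.readline()                          # "#DANSWER_METADATA={...}\n"
    metadata = json.loads(header.split("=", 1)[1])  # the JSON dict written above
    body = fd.read()                                # "\n# <title>\n\n<transcript>\n"
print(metadata["topic_title"], metadata["link"])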
server/reflector/tools/exportdb.py (new file): 64 lines
@@ -0,0 +1,64 @@
import csv
import pathlib


async def export_db(filename: str) -> None:
    from reflector.settings import settings

    filename = pathlib.Path(filename).resolve()
    settings.DATABASE_URL = f"sqlite:///{filename}"

    from reflector.db import database, transcripts

    await database.connect()
    transcripts = await database.fetch_all(transcripts.select())
    await database.disconnect()

    def export_transcript(transcript):
        tid = transcript.id
        yield tid, "title", transcript.title
        yield tid, "name", transcript.name
        yield tid, "created_at", transcript.created_at
        yield tid, "long_summary", transcript.long_summary
        yield tid, "short_summary", transcript.short_summary
        yield tid, "source_language", transcript.source_language
        yield tid, "target_language", transcript.target_language
        yield tid, "user_id", transcript.user_id
        yield tid, "status", transcript.status
        for topic in transcript.topics:
            yield tid, "topic", topic["id"], "title", topic["title"]
            yield tid, "topic", topic["id"], "summary", topic["summary"]
            yield tid, "topic", topic["id"], "timestamp", topic["timestamp"]
            yield tid, "topic", topic["id"], "transcript", topic["transcript"]

        # extract transcripts
        for idx, entry in enumerate(transcript.events):
            if entry["event"] == "TRANSCRIPT":
                yield tid, "event_transcript", idx, "text", entry["data"]["text"]
                if entry["data"].get("translation") is not None:
                    yield tid, "event_transcript", idx, "translation", entry[
                        "data"
                    ].get("translation", None)

    def export_transcripts(transcripts):
        for transcript in transcripts:
            yield from export_transcript(transcript)

    csv_output = pathlib.Path("export.csv").resolve()
    output = csv.writer(open(csv_output, "w"))
    output.writerow(["transcript_id", "key", "value", "key", "value"])
    for row in export_transcripts(transcripts):
        output.writerow(row)

    print(f"Exported {len(transcripts)} transcripts to {csv_output}")


if __name__ == "__main__":
    import argparse
    import asyncio

    parser = argparse.ArgumentParser()
    parser.add_argument("database", help="Sqlite Database file")
    args = parser.parse_args()

    asyncio.run(export_db(args.database))
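The resulting export.csv mixes row widths: transcript-level fields come out as (transcript_id, key, value), while topic and event rows carry five columns (transcript_id, kind, id or index, key, value), matching the header row written above. A minimal sketch (not part of the commit) of consuming it:

import csv

with open("export.csv", newline="") as fd:
    reader = csv.reader(fd)
    next(reader)  # skip the "transcript_id,key,value,key,value" header row
    for row in reader:
        if len(row) == 3:
            tid, key, value = row  # e.g. title, status, long_summary
        else:
            tid, kind, sub_id, key, value = row  # "topic" or "event_transcript" rows
        print(tid, key, str(value)[:60])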