mirror of
https://github.com/Monadical-SAS/reflector.git
synced 2025-12-21 04:39:06 +00:00
fix: waveform can generate NaN in json database (#481)
* refactor: fixes transcript duration type, NaN in waveform, and prepare for postgres migration
* fix: ensure we don't have NaN in waveform
* fix: missing assertionerror

Co-authored-by: pr-agent-monadical[bot] <198624643+pr-agent-monadical[bot]@users.noreply.github.com>

* fix: potential empty array

---------

Co-authored-by: pr-agent-monadical[bot] <198624643+pr-agent-monadical[bot]@users.noreply.github.com>
This commit is contained in:
@@ -24,6 +24,10 @@ target_metadata = metadata
|
||||
# ... etc.
|
||||
|
||||
|
||||
# don't use asyncpg for the moment
|
||||
settings.DATABASE_URL = settings.DATABASE_URL.replace("+asyncpg", "")
|
||||
|
||||
|
||||
def run_migrations_offline() -> None:
|
||||
"""Run migrations in 'offline' mode.
|
||||
|
||||
|
||||
@@ -0,0 +1,40 @@
|
||||
"""fix transcript duration type
|
||||
|
||||
Revision ID: 2cf0b60a9d34
|
||||
Revises: ccd68dc784ff
|
||||
Create Date: 2025-07-15 16:53:40.397394
|
||||
|
||||
"""
|
||||
from typing import Sequence, Union
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
|
||||
# Alembic revision identifiers: this migration's id and the id it revises.
revision: str = "2cf0b60a9d34"
down_revision: Union[str, None] = "ccd68dc784ff"
# No branch label and no dependency on another revision is declared.
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
|
||||
|
||||
|
||||
def upgrade() -> None:
    """Change the type of ``transcript.duration`` from INTEGER to Float.

    The column is declared as already nullable (``existing_nullable=True``).
    """
    # Originally auto-generated by Alembic.
    with op.batch_alter_table("transcript", schema=None) as batch_op:
        batch_op.alter_column(
            "duration",
            existing_type=sa.INTEGER(),
            type_=sa.Float(),
            existing_nullable=True,
        )
|
||||
|
||||
|
||||
def downgrade() -> None:
    """Change the type of ``transcript.duration`` back from Float to INTEGER.

    The column is declared as already nullable (``existing_nullable=True``).
    """
    # Originally auto-generated by Alembic.
    with op.batch_alter_table("transcript", schema=None) as batch_op:
        batch_op.alter_column(
            "duration",
            existing_type=sa.Float(),
            type_=sa.INTEGER(),
            existing_nullable=True,
        )
|
||||
@@ -0,0 +1,73 @@
|
||||
"""fix_transcript_json_nan_values
|
||||
|
||||
Revision ID: 88d292678ba2
|
||||
Revises: 2cf0b60a9d34
|
||||
Create Date: 2025-07-15 19:30:19.876332
|
||||
|
||||
"""
|
||||
|
||||
from typing import Sequence, Union
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
|
||||
# Alembic revision identifiers: this migration's id and the id it revises.
revision: str = "88d292678ba2"
down_revision: Union[str, None] = "2cf0b60a9d34"
# No branch label and no dependency on another revision is declared.
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
|
||||
|
||||
|
||||
def upgrade() -> None:
    """Replace bare ``NaN`` literals in ``transcript.events`` JSON with ``null``.

    Every row with a non-NULL ``events`` value is read. Rows whose text
    contains a ``NaN`` literal are re-serialized with each NaN turned into
    ``null`` and written back. Rows whose text cannot be parsed as JSON are
    reported with a warning on stdout and left untouched.
    """
    import json

    from sqlalchemy import text

    def replace_nan(raw):
        """Return ``(json_text, count)``; *count* NaN literals became null."""
        count = 0

        def parse_constant(name):
            # json.loads calls this hook only for the literals
            # NaN, Infinity and -Infinity.
            nonlocal count
            if name == "NaN":
                count += 1
                return None
            # Only NaN is rewritten; infinities keep their float value.
            return float(name)

        parsed = json.loads(raw, parse_constant=parse_constant)
        return json.dumps(parsed), count

    # Get database connection
    conn = op.get_bind()

    # Read all candidate rows up front so the UPDATEs below never run while
    # the SELECT result is still being iterated.
    rows = conn.execute(
        text("SELECT id, events FROM transcript WHERE events IS NOT NULL")
    ).fetchall()

    for transcript_id, events in rows:
        # NOTE(review): a driver that decodes JSON columns itself would return
        # dicts/lists here; presumably those never carry NaN text — confirm.
        if not isinstance(events, str):
            continue
        # Cheap pre-filter. It also matches "NaN" inside ordinary string
        # values, which is why the replacement count is checked below.
        if "NaN" not in events:
            continue

        try:
            fixed_events, nan_count = replace_nan(events)
        except json.JSONDecodeError as e:
            print(f"Warning: Invalid JSON for transcript {transcript_id}, skipping: {e}")
            continue

        if nan_count == 0:
            # The substring was part of a string value; nothing to fix.
            continue

        # Update the record with fixed JSON
        conn.execute(
            text("UPDATE transcript SET events = :events WHERE id = :id"),
            {"events": fixed_events, "id": transcript_id},
        )
|
||||
|
||||
|
||||
def downgrade() -> None:
    """Do nothing: no downgrade is needed because this revision is a data fix."""
    return None
|
||||
@@ -0,0 +1,39 @@
|
||||
"""transcript composite index
|
||||
|
||||
Revision ID: a9c9c229ee36
|
||||
Revises: 88d292678ba2
|
||||
Create Date: 2025-07-15 20:09:40.253018
|
||||
|
||||
"""
|
||||
|
||||
from typing import Sequence, Union
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
from sqlalchemy.dialects import postgresql
|
||||
|
||||
# Alembic revision identifiers: this migration's id and the id it revises.
revision: str = "a9c9c229ee36"
down_revision: Union[str, None] = "88d292678ba2"
# No branch label and no dependency on another revision is declared.
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
|
||||
|
||||
|
||||
def upgrade() -> None:
    """Create the non-unique index ``idx_transcript_user_id_recording_id``.

    The index covers the ``user_id`` and ``recording_id`` columns of the
    ``transcript`` table, in that order.
    """
    # Originally auto-generated by Alembic.
    index_columns = ["user_id", "recording_id"]
    with op.batch_alter_table("transcript", schema=None) as batch_op:
        batch_op.create_index(
            "idx_transcript_user_id_recording_id", index_columns, unique=False
        )
|
||||
|
||||
|
||||
def downgrade() -> None:
    """Drop the ``idx_transcript_user_id_recording_id`` index from ``transcript``."""
    # Originally auto-generated by Alembic.
    index_name = "idx_transcript_user_id_recording_id"
    with op.batch_alter_table("transcript", schema=None) as batch_op:
        batch_op.drop_index(index_name)
|
||||
@@ -12,9 +12,10 @@ import reflector.db.recordings # noqa
|
||||
import reflector.db.rooms # noqa
|
||||
import reflector.db.transcripts # noqa
|
||||
|
||||
engine = sqlalchemy.create_engine(
|
||||
settings.DATABASE_URL, connect_args={"check_same_thread": False}
|
||||
)
|
||||
kwargs = {}
|
||||
if "sqlite" in settings.DATABASE_URL:
|
||||
kwargs["connect_args"] = {"check_same_thread": False}
|
||||
engine = sqlalchemy.create_engine(settings.DATABASE_URL, **kwargs)
|
||||
|
||||
|
||||
@subscribers_startup.append
|
||||
|
||||
@@ -32,16 +32,16 @@ transcripts = sqlalchemy.Table(
|
||||
sqlalchemy.Column("name", sqlalchemy.String),
|
||||
sqlalchemy.Column("status", sqlalchemy.String),
|
||||
sqlalchemy.Column("locked", sqlalchemy.Boolean),
|
||||
sqlalchemy.Column("duration", sqlalchemy.Integer),
|
||||
sqlalchemy.Column("duration", sqlalchemy.Float),
|
||||
sqlalchemy.Column("created_at", sqlalchemy.DateTime),
|
||||
sqlalchemy.Column("title", sqlalchemy.String, nullable=True),
|
||||
sqlalchemy.Column("short_summary", sqlalchemy.String, nullable=True),
|
||||
sqlalchemy.Column("long_summary", sqlalchemy.String, nullable=True),
|
||||
sqlalchemy.Column("title", sqlalchemy.String),
|
||||
sqlalchemy.Column("short_summary", sqlalchemy.String),
|
||||
sqlalchemy.Column("long_summary", sqlalchemy.String),
|
||||
sqlalchemy.Column("topics", sqlalchemy.JSON),
|
||||
sqlalchemy.Column("events", sqlalchemy.JSON),
|
||||
sqlalchemy.Column("participants", sqlalchemy.JSON),
|
||||
sqlalchemy.Column("source_language", sqlalchemy.String, nullable=True),
|
||||
sqlalchemy.Column("target_language", sqlalchemy.String, nullable=True),
|
||||
sqlalchemy.Column("source_language", sqlalchemy.String),
|
||||
sqlalchemy.Column("target_language", sqlalchemy.String),
|
||||
sqlalchemy.Column(
|
||||
"reviewed", sqlalchemy.Boolean, nullable=False, server_default=false()
|
||||
),
|
||||
@@ -63,8 +63,8 @@ transcripts = sqlalchemy.Table(
|
||||
"meeting_id",
|
||||
sqlalchemy.String,
|
||||
),
|
||||
sqlalchemy.Column("recording_id", sqlalchemy.String, nullable=True),
|
||||
sqlalchemy.Column("zulip_message_id", sqlalchemy.Integer, nullable=True),
|
||||
sqlalchemy.Column("recording_id", sqlalchemy.String),
|
||||
sqlalchemy.Column("zulip_message_id", sqlalchemy.Integer),
|
||||
sqlalchemy.Column(
|
||||
"source_kind",
|
||||
Enum(SourceKind, values_callable=lambda obj: [e.value for e in obj]),
|
||||
@@ -73,10 +73,11 @@ transcripts = sqlalchemy.Table(
|
||||
# indicative field: whether associated audio is deleted
|
||||
# the main "audio deleted" is the presence of the audio itself / consents not-given
|
||||
# same field could've been in recording/meeting, and it's maybe even ok to dupe it at need
|
||||
sqlalchemy.Column("audio_deleted", sqlalchemy.Boolean, nullable=True),
|
||||
sqlalchemy.Column("audio_deleted", sqlalchemy.Boolean),
|
||||
sqlalchemy.Index("idx_transcript_recording_id", "recording_id"),
|
||||
sqlalchemy.Index("idx_transcript_user_id", "user_id"),
|
||||
sqlalchemy.Index("idx_transcript_created_at", "created_at"),
|
||||
sqlalchemy.Index("idx_transcript_user_id_recording_id", "user_id", "recording_id"),
|
||||
)
|
||||
|
||||
|
||||
@@ -336,6 +337,7 @@ class TranscriptController:
|
||||
.join(meetings, recordings.c.meeting_id == meetings.c.id, isouter=True)
|
||||
.join(rooms, meetings.c.room_id == rooms.c.id, isouter=True)
|
||||
)
|
||||
|
||||
if user_id:
|
||||
query = query.where(
|
||||
or_(transcripts.c.user_id == user_id, rooms.c.is_shared)
|
||||
@@ -377,6 +379,8 @@ class TranscriptController:
|
||||
if filter_recording:
|
||||
query = query.filter(transcripts.c.status != "recording")
|
||||
|
||||
# print(query.compile(compile_kwargs={"literal_binds": True}))
|
||||
|
||||
if return_query:
|
||||
return query
|
||||
|
||||
|
||||
@@ -57,7 +57,10 @@ def get_audio_waveform(path: Path | str, segments_count: int = 256) -> list[int]
|
||||
|
||||
# number of decimals to use when rounding the peak value
|
||||
digits = 2
|
||||
volumes = np.round(volumes / volumes.max(), digits)
|
||||
if len(volumes) > 0 and volumes.max() > 0:
|
||||
volumes = np.round(volumes / volumes.max(), digits)
|
||||
else:
|
||||
volumes = np.zeros_like(volumes) if len(volumes) > 0 else np.array([])
|
||||
|
||||
return volumes.tolist()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user