mirror of
https://github.com/Monadical-SAS/reflector.git
synced 2025-12-20 20:29:06 +00:00
fix: waveform can generate NaN in json database (#481)
* refactor: fixes transcript duration type, NaN in waveform, and prepare for postgres migration * fix: ensure we don't have NaN in waveform * fix: missing assertionerror Co-authored-by: pr-agent-monadical[bot] <198624643+pr-agent-monadical[bot]@users.noreply.github.com> * fix: potential empty array --------- Co-authored-by: pr-agent-monadical[bot] <198624643+pr-agent-monadical[bot]@users.noreply.github.com>
This commit is contained in:
@@ -24,6 +24,10 @@ target_metadata = metadata
|
||||
# ... etc.
|
||||
|
||||
|
||||
# don't use asyncpg for the moment
# Every "+asyncpg" occurrence is removed from the configured URL and the
# result is written back onto `settings`, so code that reads
# settings.DATABASE_URL after this line sees the rewritten value.
settings.DATABASE_URL = settings.DATABASE_URL.replace("+asyncpg", "")
|
||||
|
||||
|
||||
def run_migrations_offline() -> None:
|
||||
"""Run migrations in 'offline' mode.
|
||||
|
||||
|
||||
@@ -0,0 +1,40 @@
|
||||
"""fix transcript duration type
|
||||
|
||||
Revision ID: 2cf0b60a9d34
|
||||
Revises: ccd68dc784ff
|
||||
Create Date: 2025-07-15 16:53:40.397394
|
||||
|
||||
"""
|
||||
from typing import Sequence, Union
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
|
||||
# Revision identifiers, used by Alembic.
# `down_revision` matches the "Revises" entry in the module docstring.
revision: str = "2cf0b60a9d34"
down_revision: Union[str, None] = "ccd68dc784ff"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
|
||||
|
||||
|
||||
def upgrade() -> None:
    """Change the type of ``transcript.duration`` from INTEGER to Float.

    The column's existing nullability (nullable) is passed through unchanged.
    """
    with op.batch_alter_table("transcript", schema=None) as batch_op:
        batch_op.alter_column(
            "duration",
            existing_type=sa.INTEGER(),
            type_=sa.Float(),
            existing_nullable=True,
        )
|
||||
|
||||
|
||||
def downgrade() -> None:
    """Change the type of ``transcript.duration`` back from Float to INTEGER.

    The column's existing nullability (nullable) is passed through unchanged.
    """
    # NOTE(review): fractional durations written after the upgrade presumably
    # lose their fractional part when cast back to INTEGER - confirm.
    with op.batch_alter_table("transcript", schema=None) as batch_op:
        batch_op.alter_column(
            "duration",
            existing_type=sa.Float(),
            type_=sa.INTEGER(),
            existing_nullable=True,
        )
|
||||
@@ -0,0 +1,73 @@
|
||||
"""fix_transcript_json_nan_values
|
||||
|
||||
Revision ID: 88d292678ba2
|
||||
Revises: 2cf0b60a9d34
|
||||
Create Date: 2025-07-15 19:30:19.876332
|
||||
|
||||
"""
|
||||
|
||||
from typing import Sequence, Union
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
# `revision` is this migration's own ID; `down_revision` matches the
# "Revises" entry in the module docstring (the duration type migration).
revision: str = "88d292678ba2"
down_revision: Union[str, None] = "2cf0b60a9d34"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
|
||||
|
||||
|
||||
def upgrade() -> None:
    """Replace non-finite floats in ``transcript.events`` JSON with ``null``.

    Every transcript row with non-NULL ``events`` is scanned. Rows whose text
    contains a ``NaN`` or ``Infinity`` token are parsed, each NaN / +-Infinity
    float found in nested dicts and lists is replaced with ``None``, and the
    re-serialized JSON is written back to the row.

    Rows that cannot be parsed, or that still cannot be encoded as strict
    JSON after cleaning, are reported on stdout and left untouched.
    """
    import json
    import math

    from sqlalchemy import text

    def replace_non_finite(obj):
        """Recursively set NaN/Infinity floats to None, mutating *obj* in place."""
        if isinstance(obj, dict):
            entries = obj.items()
        elif isinstance(obj, list):
            entries = enumerate(obj)
        else:
            return
        # Assigning to an existing key/index does not resize the container,
        # so updating it while iterating is safe.
        for key, value in entries:
            if isinstance(value, (dict, list)):
                replace_non_finite(value)
            elif isinstance(value, float) and not math.isfinite(value):
                obj[key] = None

    # Get database connection
    conn = op.get_bind()

    # Fetch all transcript records with events data
    result = conn.execute(
        text("SELECT id, events FROM transcript WHERE events IS NOT NULL")
    )

    for transcript_id, events in result:
        if not events:
            continue
        # Cheap textual pre-filter: "Infinity" also matches "-Infinity".
        # NOTE(review): assumes `events` comes back as a JSON string (as the
        # json.loads call below requires) - confirm for the target driver.
        if "NaN" not in events and "Infinity" not in events:
            continue

        try:
            jevents = json.loads(events)
            replace_non_finite(jevents)
            # allow_nan=False makes the encoder raise ValueError if a
            # non-finite float survived (e.g. a bare top-level NaN). Unlike a
            # substring check on the output, it is not fooled by string values
            # that merely contain the text "NaN", and it is not stripped when
            # Python runs with -O.
            fixed_events = json.dumps(jevents, allow_nan=False)
        except ValueError as e:
            # json.JSONDecodeError is a ValueError subclass, so this covers
            # both the decode and the strict-encode failure.
            print(f"Warning: Invalid JSON for transcript {transcript_id}, skipping: {e}")
            continue

        # Update the record with fixed JSON
        conn.execute(
            text("UPDATE transcript SET events = :events WHERE id = :id"),
            {"events": fixed_events, "id": transcript_id},
        )
|
||||
|
||||
|
||||
def downgrade() -> None:
    """Intentionally a no-op: no downgrade is needed, this is a data fix."""
    return None
|
||||
@@ -0,0 +1,39 @@
|
||||
"""transcript composite index
|
||||
|
||||
Revision ID: a9c9c229ee36
|
||||
Revises: 88d292678ba2
|
||||
Create Date: 2025-07-15 20:09:40.253018
|
||||
|
||||
"""
|
||||
|
||||
from typing import Sequence, Union
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
from sqlalchemy.dialects import postgresql
|
||||
|
||||
# revision identifiers, used by Alembic.
# `revision` is this migration's own ID; `down_revision` matches the
# "Revises" entry in the module docstring (the JSON NaN data fix).
revision: str = "a9c9c229ee36"
down_revision: Union[str, None] = "88d292678ba2"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
|
||||
|
||||
|
||||
def upgrade() -> None:
    """Create a non-unique composite index on ``transcript``.

    The index is named ``idx_transcript_user_id_recording_id`` and covers the
    ``user_id`` and ``recording_id`` columns, in that order.
    """
    index_name = "idx_transcript_user_id_recording_id"
    indexed_columns = ["user_id", "recording_id"]
    with op.batch_alter_table("transcript", schema=None) as batch_op:
        batch_op.create_index(index_name, indexed_columns, unique=False)
|
||||
|
||||
|
||||
def downgrade() -> None:
    """Drop the ``idx_transcript_user_id_recording_id`` index from ``transcript``."""
    index_name = "idx_transcript_user_id_recording_id"
    with op.batch_alter_table("transcript", schema=None) as batch_op:
        batch_op.drop_index(index_name)
|
||||
Reference in New Issue
Block a user