From 8cab94cda1e71047a68111f4a3d8fd85e3e5e321 Mon Sep 17 00:00:00 2001
From: Gokul Mohanarangan
Date: Thu, 10 Aug 2023 11:40:55 +0530
Subject: [PATCH 1/6] add model evaluation feature

---
 server/tests/evaluate/__init__.py                  |    0
 .../tests/evaluate/evaluate_transcription.py       |  266 +++
 .../predicted_texts/pred_sample_1.txt              |    1 +
 .../predicted_texts/pred_sample_2.txt              |    1 +
 .../predicted_texts/pred_sample_3.txt              |    1 +
 .../evaluate/reference_texts/ref_sample_1.txt      | 1544 +++++++++++++++++
 .../evaluate/reference_texts/ref_sample_2.txt      |  620 +++++++
 .../evaluate/reference_texts/ref_sample_3.txt      |  970 +++++++++++
 8 files changed, 3403 insertions(+)
 create mode 100644 server/tests/evaluate/__init__.py
 create mode 100644 server/tests/evaluate/evaluate_transcription.py
 create mode 100644 server/tests/evaluate/predicted_texts/pred_sample_1.txt
 create mode 100644 server/tests/evaluate/predicted_texts/pred_sample_2.txt
 create mode 100644 server/tests/evaluate/predicted_texts/pred_sample_3.txt
 create mode 100644 server/tests/evaluate/reference_texts/ref_sample_1.txt
 create mode 100644 server/tests/evaluate/reference_texts/ref_sample_2.txt
 create mode 100644 server/tests/evaluate/reference_texts/ref_sample_3.txt

diff --git a/server/tests/evaluate/__init__.py b/server/tests/evaluate/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/server/tests/evaluate/evaluate_transcription.py b/server/tests/evaluate/evaluate_transcription.py
new file mode 100644
index 00000000..26c1ce7a
--- /dev/null
+++ b/server/tests/evaluate/evaluate_transcription.py
@@ -0,0 +1,266 @@
+import json
+import os
+import re
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Iterator, List, Union
+
+from jiwer import wer
+from Levenshtein import distance
+from tqdm.auto import tqdm
+from whisper.normalizers import EnglishTextNormalizer
+
+
+@dataclass
+class EvaluationResult:
+    """
+    Result object of the model evaluation
+
+    accuracy: model accuracy in percent, (1 - weighted WER) * 100
+    total_test_samples: number of reference/prediction pairs loaded
+    """
+
+    # Annotations (not assignments) so that @dataclass registers real fields
+    accuracy: float
+    total_test_samples: int
+
+    def __repr__(self):
+        return (
+            "EvaluationResult("
+            + json.dumps(
+                {
+                    "accuracy": self.accuracy,
+                    "total_test_samples": self.total_test_samples,
+                }
+            )
+            + ")"
+        )
+
+
+@dataclass
+class EvaluationTestSample:
+    """
+    Represents one test sample: the reference transcript and the
+    transcript predicted by the model
+    """
+
+    reference_text: str
+    predicted_text: str
+
+    def update(self, reference_text, predicted_text):
+        """
+        Replace both texts in place
+        """
+        self.reference_text = reference_text
+        self.predicted_text = predicted_text
+
+
+class TestDatasetLoader:
+    """
+    Test samples loader
+
+    Pairs each file in <parent_dir>/predicted_texts whose name ends in
+    "<id>.txt" with <parent_dir>/reference_texts/ref_sample_<id>.txt
+    """
+
+    parent_dir = None
+    total_samples = 0
+
+    def __init__(self, parent_dir: Union[Path, str]):
+        # Path() accepts str and Path alike
+        self.parent_dir = Path(parent_dir)
+
+    def _load_test_data(self) -> Iterator[tuple[str, str]]:
+        """
+        Loader function to validate input files and generate samples
+
+        Yields (reference file path, predicted file path). Predicted files
+        without a matching reference file are skipped.
+        """
+        PREDICTED_TEST_SAMPLES_DIR = self.parent_dir / "predicted_texts"
+        REFERENCE_TEST_SAMPLES_DIR = self.parent_dir / "reference_texts"
+
+        # Restart the count so a second pass does not double count samples
+        self.total_samples = 0
+        # Sorted: os.listdir() returns entries in arbitrary order
+        for filename in sorted(os.listdir(PREDICTED_TEST_SAMPLES_DIR.as_posix())):
+            match = re.search(r"(\d+)\.txt$", filename)
+            if match:
+                sample_id = match.group(1)
+                pred_file_path = (PREDICTED_TEST_SAMPLES_DIR / filename).as_posix()
+                ref_file_name = "ref_sample_" + str(sample_id) + ".txt"
+                ref_file_path = (REFERENCE_TEST_SAMPLES_DIR / ref_file_name).as_posix()
+                if os.path.exists(ref_file_path):
+                    self.total_samples += 1
+                    yield ref_file_path, pred_file_path
+
+    def __iter__(self) -> Iterator[EvaluationTestSample]:
+        """
+        Iter method for the test loader
+        """
+        # Unpack in the order _load_test_data() yields: reference first
+        for ref_file_path, pred_file_path in self._load_test_data():
+            with open(pred_file_path, "r", encoding="utf-8") as file:
+                pred_text = file.read()
+            with open(ref_file_path, "r", encoding="utf-8") as file:
+                ref_text = file.read()
+            yield EvaluationTestSample(ref_text, pred_text)
+
+
+class ModelEvaluator:
+    """
+    Class that comprises all model evaluation related processes and methods
+
+    Keyword arguments (None values are ignored):
+        insertion_penalty, deletion_penalty, substitution_penalty:
+            weights of the Levenshtein based WER, 1 each when omitted
+        normalizer: callable applied to both texts before scoring,
+            whisper's EnglishTextNormalizer when omitted
+        parent_dir: directory holding predicted_texts/ and reference_texts/,
+            the directory of this file when omitted
+    """
+
+    # The 2 popular methods of WER differ slightly. More dimensions of accuracy
+    # will be added. For now, the average of these 2 will serve as the metric.
+    WEIGHTED_WER_LEVENSHTEIN = 0.0
+    WEIGHTED_WER_JIWER = 0.0
+
+    normalizer = None
+    accuracy = None
+    test_dataset_loader = None
+    test_directory = None
+
+    def __init__(self, **kwargs):
+        self.evaluation_config = {k: v for k, v in kwargs.items() if v is not None}
+        # Per-instance lists: as class attributes they were shared by every
+        # evaluator and kept growing
+        self.WER_LEVENSHTEIN = []
+        self.WER_JIWER = []
+
+        # Objects are popped and replaced by a printable description so that
+        # __repr__ can always json.dumps() the config
+        self.normalizer = self.evaluation_config.pop("normalizer", None)
+        if self.normalizer is None:
+            self.normalizer = EnglishTextNormalizer()
+        self.evaluation_config["normalizer"] = str(type(self.normalizer))
+
+        parent_dir = self.evaluation_config.pop("parent_dir", None)
+        if parent_dir is None:
+            parent_dir = Path(__file__).parent
+        self.test_directory = Path(parent_dir)
+        self.test_dataset_loader = TestDatasetLoader(self.test_directory)
+        self.evaluation_config["test_directory"] = str(self.test_directory)
+
+    def __repr__(self):
+        return "ModelEvaluator(" + json.dumps(self.describe(), indent=4) + ")"
+
+    def describe(self) -> dict:
+        """
+        Returns the parameters defining the evaluator
+        """
+        return self.evaluation_config
+
+    def _normalize(self, sample: EvaluationTestSample) -> None:
+        """
+        Normalize both reference and predicted text
+        """
+        sample.update(
+            self.normalizer(sample.reference_text),
+            self.normalizer(sample.predicted_text),
+        )
+
+    def _calculate_wer(self, sample: EvaluationTestSample) -> float:
+        """
+        Based on weights for (insert, delete, substitute), calculate
+        the Word Error Rate
+
+        The texts are split on whitespace first: on the raw strings the
+        edit distance counts characters, not words. The reference text
+        must contain at least one word.
+        """
+        reference_words = sample.reference_text.split()
+        predicted_words = sample.predicted_text.split()
+        levenshtein_distance = distance(
+            s1=reference_words,
+            s2=predicted_words,
+            weights=(
+                self.evaluation_config.get("insertion_penalty", 1),
+                self.evaluation_config.get("deletion_penalty", 1),
+                self.evaluation_config.get("substitution_penalty", 1),
+            ),
+        )
+        return levenshtein_distance / len(reference_words)
+
+    def _calculate_wers(self) -> None:
+        """
+        Compute WER
+        """
+        # Start from empty lists so a recalculation does not append every
+        # sample a second time
+        self.WER_LEVENSHTEIN = []
+        self.WER_JIWER = []
+        for sample in tqdm(self.test_dataset_loader, desc="Evaluating", ncols=100):
+            self._normalize(sample)
+            no_of_words = len(sample.reference_text.split())
+            if no_of_words == 0:
+                # WER is undefined without reference words
+                continue
+            wer_item_l = {
+                "wer": self._calculate_wer(sample),
+                "no_of_words": no_of_words,
+            }
+            wer_item_j = {
+                "wer": wer(sample.reference_text, sample.predicted_text),
+                "no_of_words": no_of_words,
+            }
+            self.WER_LEVENSHTEIN.append(wer_item_l)
+            self.WER_JIWER.append(wer_item_j)
+
+    def _calculate_weighted_wer(self, wers: List[dict]) -> float:
+        """
+        Calculate the weighted WER from WER
+
+        Each item is {"wer": ..., "no_of_words": ...}; the result is the
+        mean WER weighted by no_of_words. Raises ValueError when the
+        weights sum to zero.
+        """
+        total_wer = 0.0
+        total_words = 0.0
+        for item in wers:
+            total_wer += item["no_of_words"] * item["wer"]
+            total_words += item["no_of_words"]
+        if not total_words:
+            raise ValueError("No reference words found, cannot compute WER")
+        return total_wer / total_words
+
+    def _calculate_model_accuracy(self) -> None:
+        """
+        Compute model accuracy
+        """
+        self._calculate_wers()
+        weighted_wer_levenshtein = self._calculate_weighted_wer(self.WER_LEVENSHTEIN)
+        weighted_wer_jiwer = self._calculate_weighted_wer(self.WER_JIWER)
+
+        final_weighted_wer = (weighted_wer_levenshtein + weighted_wer_jiwer) / 2
+        self.accuracy = (1 - final_weighted_wer) * 100
+
+    def evaluate(self, recalculate: bool = False) -> EvaluationResult:
+        """
+        Triggers the model evaluation
+
+        The accuracy is cached; pass recalculate=True to compute it again.
+        """
+        # "is None" and not truthiness: an accuracy of 0.0 is a valid result
+        if self.accuracy is None or recalculate:
+            self._calculate_model_accuracy()
+        return EvaluationResult(self.accuracy, self.test_dataset_loader.total_samples)
+
+
+eval_config = {"insertion_penalty": 1, "deletion_penalty": 2, "substitution_penalty": 1}
+
+evaluator = ModelEvaluator(**eval_config)
+evaluation = evaluator.evaluate()
+
+print(evaluator)
+print(evaluation)
+print("Model accuracy : {:.2f} %".format(evaluation.accuracy))
diff --git a/server/tests/evaluate/predicted_texts/pred_sample_1.txt b/server/tests/evaluate/predicted_texts/pred_sample_1.txt
new file mode 100644
index 00000000..d5fa2fa1
--- /dev/null
+++ b/server/tests/evaluate/predicted_texts/pred_sample_1.txt
@@ -0,0 +1 @@
+We 're joined next by
Thomas Curian , CEO of Google Cloud , and Alexander Wang , CEO and founder of Scale AI . Thomas joined Google in November 2018 as the CEO of Google Cloud . Prior to Google , Thomas spent 22 years at Oracle , where most recently he was president of product development . Before that , Thomas worked at McKinsey as a business analyst and engagement manager . His nearly 30 years of experience have given him a deep knowledge of engineering enterprise relationships and leadership of large organizations . Thomas 's degrees include an MBA in administration and management from Stanford University , as an RJ Miller scholar and a BSEE in electrical engineering and computer science from Princeton University , where he graduated suma cum laude . Thomas serves as a member of the Stanford graduate School of Business Advisory Council and Princeton University School of Engineering Advisory Council . Please welcome to the stage , Thomas Curian and Alexander Wang . This is a super exciting conversation . Thanks for being here , Thomas . Thank you for having me . You all just came off of your incredible Google Cloud next conference where you released a wide variety of functionality and features and new products across artisan television and also across the entire sort of cloud ecosystem . You want to just first by walking through , first start by walking through all the innovations that you sort of released and what you 're excited about when you come to Google Cloud ? Now our vision is super simple . If you look at what smartphones did for a consumer , you know they took a computer and internet browser , a communication device , and a camera , and made it so that it 's in everybody 's pocket , so it really brought computation to every person . We feel that , you know , our , what we 're trying to do is take all the technological innovation that Google 's doing , but make it super simple so that everyone can consume it . 
And so that includes our global data center footprint , all the new types of hardware and large-scale systems we work on , the software that we 're making available for people to do high-scale computation , tools for data processing , tools for cybersecurity , processing , tools for cyber security , tools for machine learning , but make it so simple that everyone can use it . And every step that we do to simplify things for people , we think adoption can grow . And so that 's a lot of what we 've done these last three , four years , and we made a number of announcements that next in machine learning and AI in particular , you know , we look at our work as four elements , how we take our large-scale compute systems that were building for AI and how we make that available to everybody . Second , what we 're doing with the software stacks and top of it , things like jacks and other things and how we 're making those available to everybody . Third is advances because different people have different levels of expertise . Some people say I need the hardware to build my own large language model or algorithm . Other people say , look , I really need to use a building block . You guys give me . So , 30s we 've done a lot with AutoML and we announce new capability for image , video , and translation to make it available to everybody . And then lastly , we 're also building completely packaged solutions for some areas and we announce some new stuff . So , it 's a busy conference , but lots of exciting stuff going on . Yeah , it 's incredible . I mean , I want to zoom out for a second to start with , which is that this is obviously not your first time taking and packaging new technology breakthroughs for the enterprise . Both in your time at Oracle and now CEO of Google Cloud , this is something that you 've been doing for quite some time now . 
When you sort of zoom all the way out , what do you think are some of the things that have some of your principles , or some of your thoughts and enabling these technological breakthroughs and actually enabling the enterprise with them ? And what are the key insights that you have there ? Thank you . A lot of the work . So first of all , we 've really built out the organization the last three years . We 've seen a huge ramp up in our business , credit to all the people who joined us at one point over 70 % of organization that joined your in COVID . So they had n't met anybody . They could n't meet their managers , but they all did an amazing job together . The adoption of technology by companies , and I 'll give you just some elements , particularly in the application of AI in different domains that we 've seen . We work with a large financial institution in Hong Kong and Shanghai Bank , which uses our machine learning to detect fraud . You know , fraud detection and banking , there 's a lot of false positives , which makes it hard to really , you know , to a very expensive people doing something called anti-money laundering . And our AI algorithms are really able to be super precise on detection . Explainability is a critical thing there , right ? So people ask , why did you , why did you approve , why did you flag this one and not that one ? Because regulators are involved . So explainability becomes a big deal . We help , we help renewal , for example , monitor all of the factories . The process roughly , a billion data sets every day . Obviously , humans can process that . But making it super simple to , and you guys have given all your expertise in labeling and other things , you would get a sense . Factory floor data is not clean data . And so you have to actually clean , imagine doing a billion data sets into an environment every single day . You have to give the data pipelines really good . 
And so a lot of technology work happens to make that possible for companies . Third is , if you shop at IKEA , for example , behind IKEA is systems , it 's our recommendation system . find IKEA is systems , it 's our recommendation system . And the way that people shop for furniture and products is not the same in all countries . And so how are you able to one deal with the benefits you get from a global model , but also to contextually the specific elements in each country because people have different buying habits . Those are all things that we 've learned applying our AI in different contexts in different parts of the world . Yeah . You 've sort of glossed over this , but you 've led since you took over at Google Cloud , just a meteoric growth of the platform . You know , I think the past few years , you 've tripled your sales force and ending last year , you obviously ca n't come in this , but end the last year at , I believe , 20 billion of annual revenue , which is incredible and this incredible growth journey . What do you attribute your success to ? And how do you think you 've been able to drive just to an incredible growth and success ? From our point of view , every industry , virtually in the world , is now becoming a software powered technology industry . If you talk to automobile companies , they 're increasingly vehicles are more about software than mechanical systems . If you talk to telecommunications companies , the networks are commodities unless they can make them platforms to deliver applications , so they need new ways to slice , manage the network . If you look at banks at the end of the day , they 're about all the products of a bank or data , and all of that becomes how do you differentiate in the value delivering clients through a digital medium ? Because increasingly , I 'm sure all of you look at yourselves and go when was the last time I went to a branch of a bank . 
So a lot of our work has been pushing the technology innovation really far , but bringing that technology super easily to people in different industries . And given the demand that people have for a hair , I really want , I need the technology to help me power my industry , the change I 'm seeing in my industry , the more accessible we can make it , the easier and the faster we get adoption , and our approach has been to be completely open . And when to be completely open . And when I say completely open , we offer every part of the stack that we have from the hardware and network to the software abstractions above to things that are more packaged because different organizations have different levels at which they have expertise and want to adopt technology . Yeah . I mean it 's been , mean it 's been obviously incredible . You know going back to AI for a second , Google , Google obviously is an early mover in AI and Google Cloud has also been through , you know , starting with TensorFlow and Vertex AI and AutoML and so many incredibly innovative technologies . And AI has been obviously kind of a buzzword for some time now within the industry . And I think we see this in use as well . The adoption has maybe been a bit slower than we would expected until now . What do you think have been the barriers to greater levels of AI adoption , greater levels of enterprise that 's in value from AI ? And what do you think the future holds ? So we 've worked with a huge number of companies doing work , having them adopt AI . A lot of the lessons we 've seen and observed from it are the barriers to adoption are rarely about the algorithm itself . It 's often the barriers to adoption about very algorithm itself . It 's often the various adoption about very different things . 
So when we work with customers in many , many industries , take retailers an example , and you think of a very mundane example , like recommendations , to make product discovery on the web much easier for their own products . The biggest challenges standardizing the meaning of the product and the catalog . Because unless you have a standardized definition of the products and the data behind the algorithm is clean , it 's super hard to actually get to recommendation . And so in the work we did with H & M , for example , or at Macy 's , or at IKEA , or Bloomingdale 's , a huge number of these brands , the big part of the program is actually how do you label and clean the data upfront and standardize it before you get into the algorithmic phase . So that 's one part of things we see . Second part is for large organizations to adopt AI , they have to need to integrate the results of the algorithm back into their core processes . So , you know , practical example , we work with OGE , OGE is a large , large electric producer , electricity and power producer in Europe . They are probably one of the largest renewable energy producer in the world . They use wind farms . One of the things they really struggled with was , how do you predict how much wind is going to be there three days from now ? Because the power grid requires that prediction in order to capacity plan how much power is going into the grid . So they work with us and they use our AI to do that . But that needs to be tied into how they 're telling the rest of the power sources that work on the grid . Hey , if this went to wind is coming in , here 's all the other sources in each generation . So tying it back in is not as simple as people think . And so a lot of time is that the third on the people side , there 's change management you go through to get people to trust the algorithm . 
So one of the things we 've done work with many banks , particularly during the pandemic , when the government issued small business loans . There was a giant bottleneck in being able to get loans out to individual consumers . And frankly , because the banks did n't want to bring a huge army of loan officers in , they had to use software and algorithms to process it . Now the challenge people had is they needed to trust the algorithm was being fair in saying yes to some and no to others and that it would mirror for example the recommendations that their best mortgage bankers would do , right ? Just as a loan office as we do . So it gave them the benefit of scale because we processed literally millions and millions of mortgages through our technology , but it required them to get comfortable that things like fairness and other things were working . So often when people look at AI , they think it 's a skills issue . There 's certainly a skill issue involved . There 's not enough talent in the ecosystem . But things are getting easier and easier as the models get more and more sophisticated . Often people forget about these other issues that are important in getting adoption . Yeah . I mean , you 're preaching the choir when you mention the data challenges that all these enterprises face and how critical that is to getting working in the early days . One of the things that I think is interesting about Google Cloud strategies that you really have products at different layers of the stack and different layers of closest to the bare metal all the way up to these package solutions . In what way do you think that the enterprise world and even the broader business world is going to adopt these AI technologies ? Do you think that the end state is that a lot of them are using your lower level , more infrastructure ? Products , or do you think that many of them are going to adopt solutions ? How do you think this plays out over the next few years ? 
So we offer four layers of technology for people . There 's a set of people who say , look , I just need your computational infrastructure , your large systems . We build something called tens of processing unit , which is our large scale systems . We 're also working with Crossing Unit , which is our large-scale systems . We 're also working within video to build a really high-scale GPU Bay system . But many people , some customers say , look , I just need access to that . And we make that available because the TPUs are what we use within Google . And we make that available along with the compilation software to optimize models on the TPUs . Take as an example , LG , the Korean company that makes appliances , their team is built a large , I mean , multi-hundred billion parameter model , because they wanted to make that a way that people can interact with appliances without having to press buttons on them . So they built a model . They said , I just need access to your infrastructures . That 's one way we offer a peak capability . A second level is people say look , I really do n't need access to the raw infrastructure itself . What I need is the ability to build models using your platform . And so we offer a platform called Vertex and people build models and push them using our machine learning platform . And there are many , many organizations in logistics and financial services in retail and others who build their own models on top of the platform . The third is to make things even easier , we 've taken some of the core pieces , translation , documents , image processing , video . And we 've said , we can offer an auto-email based solution , which further simplifies how you use our platforms . And so for example , translation , we have a capability to handle translation in 135 languages . 
One of the important things that people ask when they go to many languages is if you look at the data sets that I used to train models , they are primarily , there 's a large set in English , because you have the whole internet is primarily in a very small number of languages . But once you get to more narrow languages , for instance , Swahili or some of the African languages , or even in Asia , there are many languages , even from where I grew up in India . There are languages that are not as widely represented on the internet . Can your model in translation provide equivalent fidelity in sparse languages ? Because it 's always important to those people only understand that language that they get a high fidelity result . So we 've built something called translation hub and it 's being used in very mundane places but with extraordinary impact . For example , when people announce COVID guidelines or recently monkey parks , for example , which is another thing , they needed translate many , many languages . And normally the process would take a long time . We have movie studios , for example , in a different example , saying , hey , when we launch a movie , we have a high fidelity set of languages , we 're actually going to hold the movie up and show that people do it . But for the long tail , we just need captioning . We 're not necessarily going to do voice dubbing . We 're going to do captioning . And they use our translation solutions to go to that . Even within companies , every medicine , for example , uses it to translate all their instruction manuals into many languages for their technicians . And then lastly , in some places , there are companies like retailers who tell us , look , a handful of the largest retailers may build their own software teams . But some of us who are small merchants , we 're not software companies . And telling us , you 've got to be a software company to use AI is not fair . 
So for some industries , we actually build fully packet solutions . If you call many telephone companies , the context center , behind it , sits our voice agent . And the rationale behind that was super simple , when a new smartphone launches like an iPhone or a Pixel , typically in the morning of the launch , some of these contact centers get three , four million calls in an hour . And it 's hard to hire that many agents to handle the phones . So we said , why would n't software be able to handle it ? We then evolved it so that the natural language interface can become actually the workflow for these organizations . But that 's a much more of a package solution so that telephone companies do n't have to have armies of data scientists to do it . So our work spans all of these because people have different needs and we find that as you improve the maturation of this and you make it more easy for people to adopt it . You will get broader proliferation and adoption of AI as a whole . Yeah , you know , you walk through so many different use cases and so many applications to the technology . I imagine one , and there 's so desperately , you know , everywhere from , you know , fraud detection to translation to translation of manuals , you know , there 's such a wide translation of manuals . There 's such a wide array of use cases . How do you all like Google Cloud think about helping businesses understand what is AI good for ? What can they use AI for ? There 's obviously such a wide diversity of different use cases , but what at a framework level do you tell them , how can I use AI within my business ? It 's a really good question . I mean , a lot of our work actually comes from clients asking us now , and that 's actually an encouraging thing . Because you know , see from up on the view , some simple things , how many of you believe in a few years ' time there 's gon na be intelligence software and non-intelligence software , right ? 
I mean , nobody would say in three , few years ' time , there 's going to be intelligence software and non-intelligence software . I mean , nobody would say in three , four years ' time , we 're going to write software that has not powered in some form of fashion by AI . So you know , in most companies actually , it 's really encouraging to see that they look at domain problems they 're having and say , for instance , I used to do it using a rules engine , which is an older model for defining kind of workflow within organizations . Can you apply AI to do it in a new way ? I used to do this in a specific way . I heard about image recognition . One example really fun or interesting one , US Navy , when you have corrosion on the base of ships , the old way was to lift it into dry dark and take a look at it . If you 've ever seen one of these ships , you can imagine lifting to dry dark is not an easy thing . So they said , can we fly a drone with your camera image recognition around it and detect corrosion ? And so what we 've seen is that as you lift up the capability where image , audio , text , et cetera , all these forms of input can be processed extremely accurately , most customers start figuring it out . And so they call us with , most of our work has come from customers calling us , saying , hey , I have this need . Can I apply AI to it ? And so we talk to them about how and when it makes sense to use AI . But we also talk to them about the consequences if the models are not handling things like skew in the data . How do you ensure that , for example , you 're treating fairness properly ? How do you ensure that the model is safe , etc . Yeah , I think it 's , I mean , all the use cases , the variety is incredibly exciting . It 's cool that these customers are coming to you directly with many of them . What is , again , kind of thinking bigger picture , what is machine learning an AI mean for Google Cloud on the whole over the next call 510 years ? 
So we feel that the boundary of what machine learning and what AI can do will change over time . When it started , it was about doing what we would call assistive things . Assistive things are where a human being is able to do it , but the computer assists the human being in some ways to do it better . Right ? So common examples people talk about is , hey , your doctor or radiologist , you used to look at x-ray images . Now , a computer is going to look at it and detect tumors , but it 's assisting you to find something that you may have done another way . So that 's the first phase and a lot of the work we see is primarily in that phase today . The second phase is to do something where you could n't do it with a human because the quantity of data you need to process or the amount of people you need would be just far too significant . And so the machine is doing something that humans could n't do , but it 's still an incremental element on top of what humans could do themselves . The third phase , I think , is where we think generative AI , for example , goes , because it 's about enabling people to express themselves in a different way , and to assist them in expressiveness . So I 'll give you a practical example . A lot of you probably use tools , slides , and things like that in your day to day job . PowerPoint was invented a long time ago and was really just about drawing things . You know , I 've got a 14 year old . And so if you look at the younger generation , if you look at what slides were , they were really tools to help people draw . And then to take what was on the slide projector and presented . Then the younger generation says , hey , I do n't want to draw things that 's really old-fashioned . I 'm going to go to the internet and copy images , right ? Because when they do class projects , they 're copying images into the slides . 
And then , as people observe , you know , on the social media environment , people going from text , which may have been Facebook to short images , which is Instagram to short video TikTok , we would say , hey , why would n't we be able to record short video ? And be used that as a mechanism to share . But recording short video is still capturing the real world through the lens of the camera . What people want is a more expressive way of saying , I have an idea , can I translate it ? And it may not be something I can capture . Imagine a kid in California and a school saying saying I want to capture how the landscape and outside of Paris and France is right now . I think they need to be able to generate some of the ideas that they could capture by physically being there . And so we 're working on all of this and we 're bringing some of these into our products to change what people could possibly do through the application of AI so they improve expressiveness for people . And so every boundary as the technology gets more sophisticated we think it moves from just assistance to assistance on things that human beings may not have been able to just linearly do to now things like expressiveness , which is a very different capability than people could actually do themselves . Yeah , I mean , all of this is very obviously incredibly exciting and we 're all watching it happen in real time . There 's an artist who actually described the image generation models as , he sort of image generation models as he was , he sort of said like , you kind of think about like a camera . Like it 's a new tool that allows you to create fundamentally new forms of art . That 's right . Yeah . And not just one medium of art , right ? Because if you look in the past , people said , you were a painter , you were a sculpture , you were a musician , and now these technologies allow you to blend all of it as a form of expressiveness . Yeah . 
You know , the last question I have for you is , you know , you obviously sit down with many of the sort of leading CEOs and business leaders of of the sort of largest organizations in the world . And I 'm sure one thing that is on many of their minds is sort of as AI technology develops and it continues to progress is potential disruption that might come from art of film intelligence . What sort of , how do you approach that conversation ? What sort of your advice to these business leaders who are looking at this powerful new technology and thinking about what that might mean for the businesses and the business landscape . When we talk to CEOs , I mean the biggest things we talk to them about number one , productivity in the long term , productivity has always been the primary driver of improving both company productivity , meaning their own companies , as well as societal benefit , things like affluence of a society , etc . And the means and equality of distribution of income to people across all spectrum society . Eventually , the most important metric , and you can look at any economic textbook is productivity . Software and technology has probably been the biggest boomer productivity over the last 30 , 40 years . This is the next evolution of that . And so we always say , if you approach it the right way , for example , labor shortages are going on right now . The biggest potential benefit is the application of some of these platforms like AI to do in that . The second , with any technological generation revolution , like artificial intelligence , but if you went back in time and looked at the industrial revolution , etc . There are always during the period of transition , anxiety about the consequences of that technology . And it does n't mean the technology by itself is good or bad . It 's the application of the technology that 's good or bad . 
So it 's incumbent upon both the technology providers and the users of the technology to ensure that the negative consequences of it are managed properly . Right ? The obvious example is , for instance , if you look at a very simple thing , image recognition . Image recognition can help doctors find tumors way better than having the best radiographer . It 's a system in that context and it 's like helping people with a better quality microscope than they had before . Object recognition is helping people find , for example , people who are in the ocean much more accurately so the coastguard can rescue them . At the same time , being able to use a camera and say that 's Thomas Korean has , you know , a lot of potential negative consequences . And so as a provider of technology , we at Google have chosen not to do that third part . But we also tell companies , it 's important not just to say , this is what 's regularly allowed by the legal framework , because law in many countries is not yet keeping up with how fast AI technology is moving . But to take the responsibility as a company CEO to say , here 's what I believe comfortable with , and here 's what I wo n't be comfortable with . Yeah . Well , Thomas , thank you so much for such incredible conversations . I think I 'm very heartened to hear all the incredible work that Google Cloud is doing to make artificial intelligence accessible to the entire business world and all of every enterprise around the globe . And I 'm so excited that you 're able to join us . Thank you so much . Thank you so much for having me . Thank you . Thank you . \ No newline at end of file diff --git a/server/tests/evaluate/predicted_texts/pred_sample_2.txt b/server/tests/evaluate/predicted_texts/pred_sample_2.txt new file mode 100644 index 00000000..ca15f6c5 --- /dev/null +++ b/server/tests/evaluate/predicted_texts/pred_sample_2.txt @@ -0,0 +1 @@ + Well, health technologies ticker civil W-E-L-L. 
on the TSX recently reported it's 2023 Q1 results beating the streets consensus estimate for revenue and adjusted EBITDA. And in a report. you this week. Raymond James, and we'll say quote, we're impressed by Wells capacity to drive. powerful growth across its diverse business units in the absence of M&A. me today, you'll see your Honour Tribazi to, okay, what's next for well health. Good to see you Sir, how are you? Great to see you Richard. Thanks very much for having me. Great to have you. congratulations on your 17th consecutive quarter of record grab you. And you share some insights into what's driven these results historically and in the past Porter was found. Yeah, thank you. We were very excited about our Q1220. three results. And as you mentioned, we've had a long, you know, successful. stream of continued growth and record growth. also had accelerating organic growth. And I think a big part of the success of our franchise here is the incredibly sticky and predictable revenue that we have. Well over 90% of our business is either highly vio-curring as in. the, you know, highly predictable results of our two-sided network of patients. of providers or truly recurring as in schedule or subscribe. revenues. And this allows us to essentially make sure that you know we're on track it obviously you know like any other business things happen. And sometimes it's hard to meet those results, but what's really being unique about our platform is do you have exposure to all kinds of different aspects of health care? You know, we have primary care. care and specialized care on both sides of the border in the U.S. and Canada. So we have exposure to different types of care. It's a business model, we have exposure to the US payer network, which has higher per unit economics. 
and Canada and of course the stability and and sort of of higher fidelity kind of collections and revenue cycle process the candidate has over United States where you don't have to kind of deal with all of that payment noise. just a lot of, I'd extract built into the platform because of the diversity of different health care. businesses that we support. And Where do you see Wells future growth coming from which part of the business? excites you the most right now. Yeah, we'll look the centrifugal force of well is health care provider, and we exist to tech, enable, and amelor the business of that tech provider. And that's what we're laser focused on, and what we're seeing is providers not wanting to this is anymore. It's very simple. And so we have a digital platform and providers can either acquire what they want and be from our digital platform and implement themselves or they can decide that they don't want to run a business anymore and they don't want to configure and manage technology, which is becoming a bigger and bigger part of the world every single day. day and when we see what we've seen with that dynamic is that it's a lot of them are now just wanting to work in a place where all the technologies configured for them. It's wrapped around them and they have a competent operating partner. that is supporting the practice and take care. can care of the front office in the back office so that they can focus on providing care. This results in them seeing patients and being happier because, you know, they became doctors to see patients, not so they can manage workers and deal with HR issues and deal with lack of And all that kind of stuff. Excellent. And I know too that acquisition supply to keep roll it in well. Can you share it inside into our positions fit in? to Wells growth strategy. Sure, and look at 2020. 
2020 and 2021 we did a lot of acquisitions and 2022 we took a bit of a breather and we really focused on it integration, and I think that's one of the reasons why you saw this accelerating organic growth. We really we're able to demonstrate that we could bring together or the different elements of our technology platform. We started to sell bundles. We started to really derive synergy. and activate, you know, more sales as a result of selling all their different products as services with one voice, with one vision. So we made it easier for providing to use their technology. And I think that was a big reason for our growth. now M&A as you know we're a capital allocation company we're never far from And so we did continue to have, you know, toughens here and there. And in fact, today, We announced that we've acquired the Alberta operations of MCI-1 health. another publicly traded company who is looking to raise funds to support their business. We're very pleased with this acquisition and it just demonstrates our continued discipline. plan. These are, you know, great primary care clinics in Canada. It right in the greater Calgary area and you know, just allows us to grow our footprint. in Alberta, which is an important province for us, and it's if you look at the price. If you look at what we're getting, you know, it's just a positive of our continued. discipline. And just, you know, a few days ago at our conference call I mentioned that we have you know a really strong line up of acquisitions and you know they're starting to I think come to fruition for us. I'm helping you on the road no question. you recently announced a new AI investment program last month. What's specific areas of health care technology your AI are you focusing on and what's the when it comes to AI. Yes, I look AI. as as as as I'm sure you're aware is it's become you know really an incredibly important topic taken in all aspects of life. 
of business and, you know, not just business socially as well. Everyone's talking about. this new breakthrough disruptive technology, the large language models, and gender to the AI. I mean, look, AI has been about a 80 year old overnight success. a lot of people have been working on this for a long time. Generative AI is just sort of the culmination of a lot of things coming together and working, but it is uncorrect, enormous. innovation and and we think that this there's a very good news story about this in healthcare, particularly where we were looking to look, we were looking to look. unlock the value of the data that we all produce every single day. as humans. And so we've established an AI investment program. because no one company can tackle all of these innovations themselves and what will done, too, is taken a very much an ecosystem approach by establishing its app stock. marketplace. And so we're very excited about not only allocating capital into promising young AI companies that are focused on digital health and so I'll health care problems, but also giving them access to, you know, safely purely to our provider network, to our, you know, to our outpatient clinic. which is the largest owned and operated network in Canada by far. So, And when these, and it's, it was remarkable. We, we announced this, program. We've had just in the in the first week to 10 days we've had over a 100 inbound prospects come in that wanted to, collaborate with us. And again, I don't think that's necessarily for the money, you know, we're saying we would invest in it. minimum of a quarter of a million dollars, you know, a lot of them will likely be higher than a quarter of a million dollars. So it's not life-changing money, but our structural advantages and and the benefits that we have in the well network. Those are extremely hard to come by. And I think you'll see us, you know, help some of these companies. 
and these succeed and they will help us drive, you know, more innovative. that helps the provider. It's speaking of this very interesting AI. I know you're coming. just launched well AI voice. This is super interesting. Tell me what it is and the impact it could have on health care provides. Yeah thanks for asking our providers are thrilled with this. We've had a number of of our own well providers testing this technology. and it really feels like magic to them. It's essentially an ambient AIS. powered scribe. So it's a service that with the consent of the party involved listens to the conversation between a patient and provider. And then essentially condenses that into a medically relevant note for the chart files. Typically that is a lengthy process, a doctor has to transcribe notes, then review those notes and make sure that a appropriate medically oriented instruction. should notice is prepared and put into the chart. And that could take, you know, sometimes more than more time than the actual consultation. time. And so we believe that on average if it's you. regularly and consistently, this can give providers back at least a third of their day. And it's just a game changer. And it's just a game changer. And it's just a game changer. And We have now gone into general release with this product. It's widely available and Canada, it has been integrated into our EMR, which makes it even more valuable tools like this are going to start popping up, but if they are not integrated into your practice management system, then you have to kind of have data in in more than one place and move that around a little bit, which makes it a little bit more defensive. difficult, especially with HIPAA requirements and regulations. 
So again, I think this is the first of many types of different products and services that allow doctors to place more emphasis and focus on the patient and experience instead of having their head in a laptop and looking at you once to the wild, they'll be looking at you and speaking to their practice management system. And I think this, you know, think about it as a lot. except for for doctors, you know, this disability to speak. and have, you know, voice driven AI assistant that does things like this, I think are going to be incredibly helpful and valuable for healthcare providers. super fascinating. I mean we're just hearing you know more about AI maybe AI the first time, but here you are with the product already on the market and in the health care field. That's got to be pretty attractive to the out there right ahead of many other people, right? Thank you Richard. Thanks for that recognition that that's being our intention. We we want to demonstrate that we're all in on. ensuring that technology, the benefits providers is, is, is, is, is, accelerated and de-risk and provided, you know, and in a timely way, you know, providers need this help, we have a health care crisis in the country that is generally characterized as a lack of doctors. And so imagine if we can get our doctors to be 20 or 30% more productive. through the use of these types of tools. Well, they're going to see more pickations. And that's going to help all of us and and look if you step back well as be This model is all about having exposure to the success of doctors and doing our best to help them be more successful because we're going to revenue share relationship with most of the doctors with. And so this is good for the ecosystem. It's great for the provider and it's great for well as well. Super fascinating, Hamed Shabazi CEO, well technologies, ticker, W-E-L-L, great to catch up again. Thank you, sir. 
Thank you Richard appreciate how you having this \ No newline at end of file diff --git a/server/tests/evaluate/predicted_texts/pred_sample_3.txt b/server/tests/evaluate/predicted_texts/pred_sample_3.txt new file mode 100644 index 00000000..57d1582d --- /dev/null +++ b/server/tests/evaluate/predicted_texts/pred_sample_3.txt @@ -0,0 +1 @@ + medicine is hard work, as most of them sit easy. It takes your lectures and notes to create a Personalized study plan with exclusive videos, practice questions, and flashcards. and so much more. Try it free today. In diabetes Melodies, your body has trouble moving glucose, which is a type of sugar from your blood into your cells. This leads to high levels of glucose in your blood and not enough of it in your cells. And remember that your cells need glucose. glucose as a source of energy. So not letting the glucose enter means that the cell's star for energy. this fight having glucose right on their doorstep. In general, The body controls how much glucose is in the blood relative to how much gets into the cells with two insulin and boogogon insulin is used to reduce bloke glucose levels and glucogannas used to increase bloke glucose levels. Both of these hormones are produced by clusters of cells in the pancreas called eyelets of longer Insulin is secreted by beta cells in the center of these islands, and Lukagan is secreted by Elfisels in the periphery of the islands. the amount of glucose in the blood by binding the insulin receptors embedded in the cell membrane is insulin-responseed tissues, like muscle cells in adipose tissue. When activated, the insulin receptors cause vesicles containing glucose transporter that are inside the cell to fuse with the cell membrane, allowing glucose to be transported into the cell. Who could go on does exactly the opposite? It raises the boy group of glucose levels by getting the liver to generate new molecules of glucose from other molecules. 
And also, break down glycogen into glucose so that it can all get dumped into the blood. Diabetes notice is diagnosed when blood glucose levels get too high and this is seen among 10% of the world population. There are two types of diabetes, type 1 in type 2. And the main difference between them is the underlying mechanism that causes the blood glucose level. to rise. About 10% of people with diabetes have type 1 and the remaining 9% 90% of people with diabetes have type 2. Let's start with type 1 diabetes. some times just called type 1 diabetes. In this situation the body doesn't make enough insulin. The reason this happens is that in type 1 diabetes there's type 4 hypersensitivity response or a cell mediated immune response, where a person own T-cells attack the pancreas. As a quick review, remember that the immune system has T cells that react to all sorts of antigens, which are usually small peptides. polysaccharides or lipids. And that's some of these antigens are part of our own body's cell. It doesn't make sense to allow T-cells that will attack our own cells to hang around. until there's this process to eliminate them called self-tolerance. In type 1 diabetes, there's a genetic abnormality that causes a loss of self-tolerance T cells that specifically target the beta cell antigens losing cell Lawrence means that these T-cells are allowed to recruit other immune cells, and coordinate on these beta cells, losing beta cells means less insulin. and less insulin means that glucose piles up in the blood, because it can't enter the body's cells. One really important group of genes involved in regulation of the immune response is the human glucoseide antigen system, or HLA system. Even though it's called a system, it's basically this group of genes on chromosome 6 that encode the major histocompatibility. complex or MHC, which is a protein that's extremely important in helping the immune system recognized foreign molecules, as well as maintaining self-tolerance. 
MHC is like the serving platter that antigenes are presented to the immune cells on. Interestingly, people with Type 1 diabetes often have specific HLA genes in common with each other. One called HLADR3 and another called HLADR4. But this is just a genetic clue, right? Because not every one of HLADR3 and HLADR3 and LATR4 develops diabetes. In diabetes mode, it's type 1. destruction of beta cells usually starts early in life. But sometimes up to 90% of the beta cells are destroyed before symptoms crop up. control diabetes that all sound similar are polyphasia, like osteoorosis. polyureka and polydipsia. Let's go through them one by one. Even though there's a lot of glucose in the blood, it cannot get into the cells, which leaves cell star for energy. So in response, Adobos tissue starts breaking down fat. called like policies and muscle tissue starts breaking down proteins. Both of which results in weight loss for someone with uncontrolled diabetes. This catabolic state leaves people viewing hungry. Also known as Polyphasia. Phasia means eating and Poly means Now with high glucose levels that means that when blood gets filter through the kidneys, some of it starts to spill into the urine, called glycosurio. first of glucose and urea the urine. Since glucose is asthma automatically active, water tends to follow it, resulting in an increase in urination or polyurethane Allie again refers to a lot and you're yeah again refers to urine Finally, because there's so much urination, people with uncontrolled diabetes. become dehydrated and thirsty, or polydipsia. Poly means a lot. dipcr means thirst. Even though people with diabetes are not able to to produce their own insulin, they can still respond to insulin. So treatment involves life long. insulin therapy to regulate their blood glucose levels and basically enable their cells to use glucose. One really serious complication with type one diabetes is called diabetic keto acidosis or D.K.A. 
Don't understand it, let's go back to the video. the process of light policies, or fat is broken down into free fatty acids. After that happens, the liver turns the fatty acids into ketone bodies. Like a silo, a silo. acid and beta hydroxypeteric acid. A cdoc acid is a keto acid. because it has a ketone group and a carboxylic acid group. Beta hydroxy beturus. acid on the other hand, even though it's still one of the ketone bodies, isn't technically a keto acid. so that's keytown group has been reduced to a hydroxyl group is keytown bodies are important because they can be used by cells for energy, but they also increase the acidity of the blood, which is why it's called keto acidosis, and the blood coming really acidic can have major effects throughout the body. Individuals can develop a small respirator. which is a deep and labored breathing as the body tries to move carbon dioxide out of the blood. In an effort to reduce its acidity, cells also have a transport order that exchanges hydrogen ions or protons for potassium. When the blood gets acidic, it's by definition loaded with protons, like it's said in the cells while potassium gets sent into the fluid outside cells. And keep in mind is that in addition to helping glucose enter cells, insulin stimulates async ATPases, which help potassium get into the cells, and so without insulin more potassium stays in the fluid outside cells. Both of these mechanisms lead to increased potassium in the fluid outside cells, which quickly makes it into the blood and causes hyperphysis. The potassium is then extruded, so over time, even though the blood potassium levels remain high. Overall stores of potassium in the body, which include potassium inside cells starts to run low. 
Individuals will also have a high anion gap, which reflects a large difference in the unmeasured negative and positive ions in serum largely due to the buildup of keto acids, diabetic keto Asadosis can happen even in people who've already been diagnosed with diabetes and currently have some sort of of insulin therapy. This is up in that frame, which in turn stimulates the release of Glucogon. Too much Glucogon. can tip the delicate hormonal balance of Boopagonan insulin in favor of elevating blood sugars and can lead to a cascade of events we just described. Increase glucose in the blood. loss of glucose in the urine, loss of water, dehydration, and in parallel and need for alternative energy, generation of ketone bodies, and keto acidosis. Interestingly, both ketone bodies break down into acetone, and escape as a gas by getting breathed. out the lungs, which gives us sweet, fruity smell to a person's breath. Although that's the only sweet thing about this illness, which also causes nausea, vomiting and if severe, mental status changes in acute cerebral edema. Treatment of a DKA episode involves giving plenty of fluids, which helps with dehydration. insulin which helps lower blood glucose levels and replacement of electrolytes like potassium. All of which help to reverse the acidosis. Now let's switch gears and talk about type 2 diabetes, which we're at the body makes instance. but the tissues don't respond as well to it. The exact reason why cells don't respond isn't fully understood. Essentially the body is providing the of insulin, but the cells don't move their glucose transporters to their membrane in response. Which remember is needed for the glucose to get into the cells? These cells therefore have insulin resistance. Some risk factors for insulin resistance are obesity. city, lack of exercise, and hypertension. The exact mechanisms are still being explored. For example, an excess of adipose tissue or fat. 
It's not to cause the release of free fatty acids and so-called adabokines, which are signaling fuel-second cause inflammation, which seems related to insulin resistance. However, many people that are obese are not diabetic, so genetic will play a major role as well. We see this when we look at twin studies as well. We're having a twin with Type II Diabetes increased at the risk of developing Type II Diabetes. completely independently of other environmental risk factors. And type 2 diabetes. Since tissues don't respond as well to normal levels of insulin, the body ends up producing more insulin in order to get the same effect and move glucose out of the blood. through beta cell hyperplasia and increased number of beta cells. and beta cell hypertrophy, where they actually grow in size. All in this attempt to pump out more insulin. This works for a while and by keeping insulin levels higher than normal, but glucose levels can be kept normal, called normal glycemia. Now, along with insulin, beta cells also secret eyelet and alloy polypeptide. So while Beta cells are cranking out insulin, they also secrete an increase. the amount of amuline. Over time, amuline builds up and aggregates in the islands. This beta cell compensation, though, is not sustainable. and over time those maxed out beta cells get exhausted, they become dysfunctional, and high-potrophy and get smaller, as well as high-boa-plasia and die-off. As beta cells are lost in insulin levels decrease, glucose levels in the blood start to increase in patients develop hyper glycemia, which leads to similar clinical signs that we mentioned before, like Polyphasia, Bikosuria, Polyuria. and polydipsia. But unlike type 1 diabetes, there's generally some circulating insulin and type 2 diabetes from the beta cells that are trying to compensate for the insulin resistance. This means that the insulin book gun balances such that diabetic ketoacidosis is not usually developed. 
Having said that, a complication called hyper-osmola hyper glycemic state, or HHS, is much more common in type 2 diabetes than type one diabetes and it causes increased plasma ultimately due to extreme dehydration. and concentration of the blood. To help understand this remember that glucose is a pull their molecule that cannot passively diffuse across cell membranes, which means that it is a solid. So when levels of glucose are super high in the blood, meaning it's a high for Osmolir State, water starts to leave the body cells and enter the blood vessels. Even the cells relatively dry and trival, rather than plump in juicy. blood vessels that are full of water lead to increased urination and total body dehydration. This is a very serious situation because the dehydration of the body cells and in particular the brain can cause a number of symptoms, including mental status changes. In HHS, you can sometimes see mild hedonemia and acidosis. but not to the extent that it's seen in DKA. And in DKA, you can see some hyper-oscalary. So there's definitely overlap between these two syndromes. type 1 and type 2 diabetes are also a couple other sub types of diabetes melodies. The stational diabetes is when pregnant women have increased blood glucose, which is particularly during the third trimester. Although ultimately unknown, because it's thought to be related it's a pregnancy hormones that interfere with insulin action on insulin receptors. Also, sometimes you build and develop drug-induced diabetes, which is where medications have side effects that tend to increase blood glucose levels. for both of these is thought to be related to insulin resistance, like type 2 diabetes. an autoimmune destruction process like in type 1 diabetes, diagnosing type one or type 2 diabetes is done by getting a sense for how much glucose is floating around in the blood. and has specific standards that the World Health Organization uses. 
and glucose tests is taken where the person doesn't eat or drink. Except the water, that's okay. for a total of eight hours and then has their blood tested for glucose levels levels of 100 milligrams per deciliter to 125 milligrams per deciliter indicates pre-dite. and 126 milligrams per dec leader or higher indicates diabetes. A non-fasting or random glucose test can be done at any time. with 200 milligrams per deciliter or higher being a red flag for diabetes. Another test is called an oral glucose tolerance test, where a person is giving glucose and then blood samples are taking at time intervals to figure out how well it's being cleared from the blood. And most importantly, interval being 2 hours later. Levels of 140 milligrams per desuiter to 109 99 milligrams per deswitter indicate pre-diabetes. 200 or above indicates diabetes. Another thing to know is that when blood the glucose levels get high, the glucose can also stick to proteins that are floating around in the blood or in cells. So that brings us to another type of test that can be done, which is the HBA1C. test, which tests for the proportion of hemoglobin and red blood cells that has glucose stuck to it. to it, of glycated hemoglobin, HPA1C levels of 5. 0.7% to 6.4% indicate pre-diabetes and 6.5% are higher indicates diabetes. This proportion of glycated hemoglobin doesn't change day to day So it gives a sense for whether the blunt glucose levels have been high over the past two to three months. Finally, we have the CPATH diet test, which tests for bi-products. of insulin production. If the level of CPF tied is low or absent, it means the pancreas the level of CPF tied is low or absent. no longer producing enough insulin, and the glucose cannot enter the cells. For type 1 diabetes, insulin is the only treatment option. For type 2. two diabetes on the other hand. Lifestyle changes like weight loss and exercise. 
along with the healthy diet and oral anti-diabetic medication, like met for women in several other classes. in some times to be enough to reverse some of that insulin resistance and keep blood sugar levels in However, if oral anti-diabetic medications fail, I have two different types of medications. diabetes can also be treated with insulin. Something to bear in mind is that insulin treatment comes with a risk of hypoglycemia, especially if insulin is taken without a meal. Symptoms of hypoglycemia can be mild, like weakness, hunger, and shaking, but they can progress to a loss of consciousness in seizures in severe cases. In mild cases, drinking juices or eating candy or sugar, might be enough to bring blood sugar up, but in severe cases intravenous glucose should be given as soon as possible. The FDA is also recently approved the treatment for severe hypobacemia. All right, now over time, high glucose levels can cause damage to tiny blood vessels, while the micro-abache of the turret. across this called high line arteriolosthlerosis is where the walls of the arteriolos develop deposits, which are deposits of proteins, and these make them hard and inflex In capillaries, the basement membrane can flick it, and make it difficult for oxygen. to easily move from the capillary to the tissues, causing hypoxia. One of the most significant effects is that diabetes increases the risk of medium and large arterial wall damage, and subsequent atherosclerosis, which can lead to heart attack. and strokes, which are major causes of morbidity and mortality for patients with diabetes. In the eyes diabetes can lead to retinopathy and evidence of that can be seen on to find a scopic example that shows cotton wool spots or flare hemorrhages and can eventually In the kidneys, the afferent and e-ferrent arterioles as well as the glimmerialist itself can get damaged, which can lead to a nephrodic syndrome that's slowly damaged. 
diminishes the kidney's ability to filter blood over time, and can ultimately lead to dialysis. Diabetes can also affect the function of nerves causing symptoms like a decrease in sensation in the toes and fingers. Sometimes called a stock and glove distribution. As well as causes the autonomic nervous system to malfunction. That system controls a number. of body functions, everything from sweating to passing gas. about the poor blood supply and nerve damage can lead to ulcers, typically on the feet. And don't heal quickly and can get pretty severe and need to be amputated. These are some of the complications of uncontrolled diabetes, which is why it's important to diagnose and control diabetes. IPDs through a healthy lifestyle, medications to reduce insulin resistance. and even insulin therapy if beta cells have been exhausted. Well, type 1 diabetes kidney. not be prevented. Type 2 diabetes can. In fact, many people diabetes can control their blood sugar levels really effectively and live a full and active life without any of the complications. patients. Thanks for watching. If you're interested in the deeper dive on this topic. Take a look at us most is.org where we have flashcards, questions, and tools to help you learn medicine. 
\ No newline at end of file diff --git a/server/tests/evaluate/reference_texts/ref_sample_1.txt b/server/tests/evaluate/reference_texts/ref_sample_1.txt new file mode 100644 index 00000000..341d622f --- /dev/null +++ b/server/tests/evaluate/reference_texts/ref_sample_1.txt @@ -0,0 +1,1544 @@ +CEO of Google cloud and Alexander Wang + +CEO and founder of scale AI Thomas + +joined Google in November 2018 as the + +CEO of Google Cloud prior to Google + +Thomas spent 22 years at Oracle where + +most recently he was president of + +product development before that Thomas + +worked at McKinsey as a business analyst + +and engagement manager his nearly 30 + +years of experience have given him a + +deep knowledge of engineering Enterprise + +relationships and Leadership of large + +organizations Thomas's degrees include + +an MBA in administration and management + +from Stanford University as an RJ Miller + +scholar and a bsee in electrical + +engineering and computer science from + +Princeton University where he graduated + +summa laude Thomas serves as a + +member of the Stanford Graduate School + +of Business advisory Council and + +Princeton University School of + +Engineering advisory Council please + +welcome to the stage Thomas kurian and + +Alexander Wang + +[Music] + +this is a super exciting conversation + +thanks for uh thanks so much for being + +here Thomas thank you for having me you + +all just came off of uh your incredible + +Google Cloud next conference Where You + +released a wide variety of functionality + +and features and sort of new products + +across artificial intelligence but also + +across the entire sort of cloud + +ecosystem do you want to just first by + +walking through uh first start by + +walking through uh all the innovations + +that that you sort of released and uh + +and what you're excited about when it + +comes to Google Cloud + +you know our vision is super simple if + +you look at + +what smartphones did for a consumer you + +know they 
took + +a computer + +an internet browser a communication + +device and a camera and made it so that + +it's in everybody's pocket so it really + +brought computation to every person + +we feel that you know our our what we're + +trying to do is take all the + +technological innovation that Google's + +doing + +but make it super simple so that + +everyone can consume it and so that + +includes our global data center + +footprint + +all the new types of hardware and + +large-scale systems we work on + +the software that we're making available + +for people to do high-scale computation + +tools for data processing tools for + +cyber security + +tools for machine learning but make it + +so simple that everyone can use it + +and every step that we do to simplify + +things for people we think adoption can + +grow and so that's a lot of what we've + +done these last three four years and we + +made a number of announcements that next + +in in machine learning and AI in + +particular you know we look at our work + +as four elements + +how we take our large-scale compute + +systems that we're building for AI and + +how we make that available to everybody + +second what we're doing with the + +software stacks on top of it things like + +Jacks and other things and how we're + +making those available to everybody + +third is advances because different + +people have different levels of + +expertise some people say I need the + +hardware to build my own large language + +model or algorithm other people say look + +I really need to use a building block + +you guys give me so third is we've done + +a lot with automl and we announced new + +capability for image video and + +translation to make it available to + +everybody and then lastly we're also + +building completely packaged solutions + +for some areas and we announced some new + +stuff so it was a busy conference but + +you know lots of exciting stuff going on + +yeah it's incredible I mean I want to + +zoom out for a second 
to start with + +which is that this is obviously not your + +first time taking and packaging new + +technology breakthroughs for for the + +Enterprise you know both in your time at + +Oracle and now CEO of Google Cloud this + +is something that you've been doing for + +quite some time now when you sort of + +Zoom all the way out what do you think + +are some of the things that have some of + +of your principles or some of your + +thoughts and enabling these + +technological breakthroughs and actually + +enabling the Enterprise with them and + +what are sort of the key insights that + +you have there thank you a lot of the + +work so first of all we've really built + +out the organization the last three + +years we've seen a huge ramp up in our + +business credit to all the people you + +know who joined us + +at one point over 70 percent of + +organizations that joined during covid + +so they hadn't met anybody they couldn't + +meet their managers but they all did an + +amazing job together + +the adoption of Technology by companies + +and I'll give you just some elements + +particularly in the application of AI in + +different domains that we've seen + +we work with a large financial + +institution in Hong Kong and Shanghai + +bank which uses our machine learning to + +detect fraud + +you know fraud detection and banking + +there's a lot of false positives which + +makes it hard to really you know to it's + +very expensive for people doing + +something called anti-money laundering + +and our AI algorithms are really able to + +be super precise on detection + +explainability is a critical thing there + +right so people ask why did you why did + +you approve why did you flag this one + +and not that one because Regulators are + +involved so explainability becomes a big + +deal + +um we helped we helped uh Renault for + +example monitor all of the factories + +they process roughly a billion data sets + +every day obviously humans can process + +that + +but making it 
super simple to and you + +guys had given all your expertise in + +labeling and other things you would get + +a sense Factory floor data is not clean + +data and so you have to actually clean + +imagine doing a billion data sets into + +an environment every single day you have + +to get the data pipelines really good + +and so a lot of Technology work happens + +to make that possible for companies + +um third is if you shop at Ikea for + +example behind Ikea is systems it's our + +recommendation system + +and the way that people shop for + +furniture + +and products is not the same in all + +countries and so how are you able to one + +deal with the benefits you get from a + +global model + +but also take contextually the specific + +elements in each country because people + +have different buying habits those are + +all things that we've learned applying + +our AI in different contexts in + +different parts of the world yeah you + +know you've you've you're uh you sort of + +uh glossed over this but you've LED + +since you took over at Google Cloud just + +a meteoric growth of the of the platform + +you know I think in the past few years + +you've tripled your sales force and + +ending last year you obviously can't + +comment on this but ended last year at I + +believe 20 billion uh of annual revenue + +which is which is incredible and and + +this incredible growth Journey what do + +you attribute your success to and how do + +you think you've been able to to drive + +to such an incredible incredible growth + +and success + +you know from our point of view every + +every industry virtually in the world is + +now becoming a software powered you know + +technology industry right if you talk to + +automobile companies they're + +increasingly their vehicles are more + +about software than mechanical systems + +if you talk to telecommunications + +companies their networks are Commodities + +unless they can make them platforms to + +deliver applications so they need new 
+ +ways to slice manage the network + +if you look at banks at the end of the + +day they're about all the products of a + +bank are data and all of that becomes + +how do you differentiate in the value + +you're delivering clients through a + +digital medium because increasingly I'm + +sure all of you look at yourselves and + +go when was the last time I went to a + +branch of a bank so a lot of our work + +has been pushing the Technology + +Innovation really far but bringing that + +technology super easily to people in + +different Industries and given the + +demand that people have for a hey I + +really want I need the technology to + +help me power my industry that the + +change I'm seeing in my industry the + +more accessible we can make it the + +easier and the faster we get adoption + +and our approach has been to be + +completely open and when I say + +completely open we offer every part of + +the stack that we have from the hardware + +and network to the software abstractions + +above + +two things that are more packaged + +because different organizations have + +different levels at which they have + +expertise and want to adopt technology + +yeah yeah I mean it's been I mean it's + +been obviously incredible you know going + +back to AI for a second Google Google + +obviously is is an early mover in Ai and + +Google cloud has also been through you + +know or starting with tensorflow and + +vertex Ai and automl and so many + +incredibly Innovative Technologies and + +uh ai's been obviously kind of a a + +buzzword for some time now within the + +industry and and + +um you know I think we see this and you + +see as well the adoption has maybe been + +a bit slower than we would have expected + +until now what do you think have been + +the barriers to Greater levels of AI + +adoption greater levels of of + +Enterprises seeing value from Ai and and + +what do you think the future holds + +so we work with a huge number of + +companies doing work having them 
adopt + +AI + +a lot of the lessons we've seen and + +observed from it + +are the barriers to adoption are rarely + +about the algorithm itself right it's + +often the barriers to adoption about + +very different things so when we work + +with customers in many many Industries + +take retail as an example + +and you think of a very mundane example + +like recommendations to make product + +discovery on the web much easier for + +their own products the biggest challenge + +is standardizing the meaning of the + +product and the catalog Because unless + +you have a standardized definition of + +the products and the data behind the + +algorithm is clean it's super hard to + +actually get a recommendation and so in + +the work we did with h m for example or + +at Macy's or at Ikea or Bloomingdale's a + +huge number of these Brands the big part + +of the program is actually how do you + +label and clean the data up front and + +standardize it before you get into the + +algorithmic phase so that's one part of + +things we see + +second part is for large organizations + +to adopt AI they have to need to + +integrate the the the results of the + +algorithm back into their core processes + +so you know practical example we work + +with Angie Angie is a large large + +electric producer electricity and power + +producer in Europe + +they are probably the one of the largest + +renewable energy producer in the world + +they use wind farms + +one of the things they really struggled + +with was how do you predict how much + +wind is going to be there three days + +from now because the power grid requires + +that prediction in order to capacity + +plan how much power is going into the + +grid so they work with us and they use + +our AI to do that + +but that needs to be tied into how + +they're telling the rest of the power + +sources that work on the grade hey if + +this much wind is coming in here's all + +the other sources need to generate so + +tying it back in is not as simple 
as + +people think and so a lot of time is + +that + +the third on the people side there's + +change management you go through to get + +people to trust the algorithm so one of + +the things we've done work with many + +banks particularly during the pandemic + +when the government issued small + +business loans + +there was a giant bottleneck in being + +able to get loans out to individual + +consumers + +and frankly because the banks didn't + +want to bring a huge Army of loan + +officers in + +they had to use software and algorithms + +to process it now the challenge people + +had is they needed to trust the + +algorithm was being fair in saying yes + +to some and no to others + +and that it would mirror for example the + +recommendations that their best mortgage + +you know Bankers would do right just as + +a loan officers would do so it gave them + +the benefit of scale because we + +processed literally millions and + +millions of mortgages through our + +technology but it required them to get + +comfortable that things like fairness + +and other things were working so often + +when people look at AI they think it's a + +skills issue there's certainly a skill + +issue involved there's not enough talent + +in the ecosystem but things are getting + +easier and easier as the models get more + +and more sophisticated often people + +forget about these other issues that are + +important in getting adoption yeah I + +mean you're uh you're preaching in the + +choir when you mentioned the the data + +challenges that all these Enterprises uh + +face and uh and how critical that is to + +getting AI working in the early days + +um you know one of one of the things + +that I think is interesting about Google + +Cloud strategy is that you really have + +products that different layers of sort + +of the stack and different layers of of + +um you know closest to the bare metal + +all the way up to these package + +Solutions you know I'm with in what way + +do you think that 
the Enterprise world + +and even the the sort of broader + +business world is going to adopt these + +AI Technologies do you think that the + +end stated that a lot of them are using + +your lower level more infrastructure uh + +products or do you think that many of + +them are going to adopt Solutions how do + +you think this plays out over the the + +next few years + +so we offer four layers of technology + +for people + +there's a set of people who say look I + +just need your + +you know computational infrastructure + +your large systems we build something + +called tensor Processing Unit which is + +our large scale systems we're also + +working with Nvidia to build a really + +high scale gpu-based system + +but many people some some customers say + +look I just need access to that and we + +make that available because the tpus are + +what we use within Google and we make + +that available along with the + +compilation software to optimize models + +on the tpus + +take as an example of LG you know the + +the Korean company that makes appliances + +their team has built a a large I mean + +multi-hundred billion parameter model + +because they wanted to make that a way + +that people can interact with appliances + +without having to press buttons on them + +they so they built a model they said I + +just need access to your infrastructure + +so that's one way we offer capability + +a second level is people say look I + +really don't need access to the raw + +infrastructure itself what I need is the + +ability to build models using your + +platform and so we offer a platform + +called vertex and people build models + +and push them using our machine learning + +platform and there are many many + +organizations in logistics and financial + +services in retail and others who build + +their own models on top of the platform + +the third is to make things even easier + +we've taken some of the core pieces + +translation documents + +uh image processing video + +and 
we've said we can offer an automl + +based solution which further simplifies + +how you use our platforms + +and so for example translation we have a + +capability to handle translation in 135 + +languages + +one of the important things that people + +ask when they go to many languages is + +the if you look at the data sets that + +are used to + +to train models + +they are primarily there's a large set + +in English because you have the whole + +internet is primarily in a very small + +number of languages but once you get to + +more narrow languages for instance + +Swahili or some of the African languages + +or even in Asia there are many languages + +even from where I grew up in India there + +are languages that are not as widely + +represented on the internet can you + +model in Translation provide equivalent + +Fidelity in sparse languages because + +it's always important to those people + +who only understand that language that + +they get a high fidelity result + +so we built something called translation + +Hub and it's being used in very mundane + +places but with extraordinary impact for + +example when people announce covet + +guidelines or recently monkey pox for + +example which is another thing they need + +to translate in many many languages and + +normally the process would take a long + +time + +we have movie studios for example in a + +in a different example saying hey when + +we launch a movie + +uh we have a high fidelity set of + +languages we're actually going to hold + +the movie up and show that people do it + +but for the long tail we just need + +captioning uh we're not necessarily + +going to do voice dubbing we're going to + +do captioning and they use our + +translation solutions to go to that even + +within companies Avery Dennison for + +example uses it to translate all their + +instruction manuals into many languages + +for their technicians + +and then lastly in some places there are + +companies like retailers who tell us + +look a 
handful of the largest retailers + +May build their own software teams + +but some of us who are small Merchants + +we're not software companies and telling + +us you got to be a software company to + +use AI is not fair + +so for some Industries we actually build + +fully packaged Solutions if you if you + +call many telephone companies their + +contact center behind it sits a voice + +agent + +and the rationale behind that was super + +simple when a new smartphone launches + +like an iPhone or a pixel typically in + +the morning of the launch some of these + +contact centers get three four million + +calls in an hour + +and it's hard to hire that many agents + +to handle the phones so we said why + +wouldn't software be able to handle it + +we then evolved it so that the natural + +language interface can become actually + +the workflow for these organizations but + +that's a much more of a package solution + +so that telephone companies don't have + +to have armies of data scientists to do + +it so our work spans all of these + +because people have different needs and + +we find that you know as you improve the + +maturation of this and you make it more + +easy for people to adopt it you will get + +broader proliferation and Adoption of AI + +as a whole + +yeah you know you walk through so many + +different use cases and so many + +applications of the technology I imagine + +one um and they're so desperate you know + +everywhere from uh you know fraud + +detection to translation to sort of + +translation of manuals you know there's + +such a wide array of use cases how do + +you you all at Google Cloud think about + +helping businesses understand what what + +is AI good for you know what what can + +they use AI for you know there's there's + +obviously such a wide + +um uh diversity of different use cases + +but what at a framework level do you do + +you tell them like how can I use AI + +within my business + +it's a really good question I mean a lot + +of our 
work actually comes from clients + +asking us now and that's actually + +an encouraging thing because you know + +see from our point of view some simple + +things how many of you believe in a few + +years time there's going to be + +intelligent software and + +non-intelligence software + +right I mean nobody would say in three + +four years time we're going to write + +software that has not powered in some + +form of fashion by AI so you know and + +most companies actually it's really + +encouraging to see that they look at + +domain problems they're having and say + +for instance I used to do it using a + +rules engine which is an older model for + +defining kind of workflow within + +organizations can you apply AI to do it + +in a new way + +um I used to do this in a specific way I + +heard about image recognition but you + +know one example really fun or + +interesting one U.S Navy + +um when you have corrosion on the base + +of ships the old way was to lift it into + +Dry Dock and take a look at it if you've + +ever seen one of these ships you can + +imagine lifting into dry dock is not an + +easy thing so they said can we fly a + +drone with Geo camera image recognition + +around it and detect corrosion and it's + +so the what we've seen is that as you + +lift up the capability where image audio + +text Etc all these forms of input + +can be processed extremely accurately + +most customers start figuring it out and + +so they call us with most of our work + +has come from customers calling us + +saying hey I have this need can I apply + +AI to it and so we talk to them about + +how and when it makes sense to use AI + +but we also talk to them about the + +consequences if the models are not you + +know handling things like skew in the + +data how do you ensure that for example + +you're treating fairness properly how do + +you ensure that the model is safe etc + +etc + +yeah you know I think uh it's it's + +exciting I mean all the use cases the + +variety is is 
incredibly exciting it's + +cool that these customers are coming to + +you + +um directly with many of them what what + +is you know again kind of uh thinking + +bigger picture what is machine learning + +and AI mean for Google Cloud on the + +whole over the next call it five ten + +years + +so we feel that the boundary of what + +machine learning and what AI can do will + +change over time + +uh when it's started it was about doing + +what you know what we would call + +assistive things + +assist if things are where a human being + +is able to do it but the computer + +assists the human being in some ways to + +do it better right so common examples + +people talk about is hey you're a doctor + +or radiologist + +you used to look at x-ray images now a + +computer is going to look at it and + +detect tumors but it's assisting you to + +find something that you may have done + +another way + +so that's the first phase and a lot of + +the work we see is is primarily in that + +phase today + +the the second phase is to do something + +where you couldn't do it with a human + +because the quantity of data you need to + +process or the amount of people you need + +would be just far too significant and so + +the machine is doing something that + +humans couldn't do but it's still an + +incremental element on top of what + +humans could do themselves + +the third phase I think is where we + +think generative AI for example goes + +because it's about enabling people to + +express themselves in a different way + +right and to assist them in + +expressiveness so I'll give you a + +practical example a lot of you probably + +use tools uh like slides and things like + +that in your day-to-day job right + +PowerPoint was invented a long time ago + +and was really just about drawing things + +you know I've got a 14 year old and so + +if you look at the younger generation + +if you look at what slides were they + +were really tools to help people draw + +and then to take what was on 
the slide + +projector and present it + +then P you know the the younger + +generation says hey I don't want to draw + +things that's like really old-fashioned + +I'm going to go to the internet and copy + +images right because they when they do + +class projects They're copying images + +into the slides + +and then you know as as people observe + +you know on the social media environment + +people going from text which may have + +been Facebook to short to images which + +is Instagram to short video Tick Tock + +people say hey why wouldn't we able to + +record short video and we use that as a + +mechanism to share but recording short + +video is still capturing the real world + +through the lens of the camera + +what people want is a more expressive + +way of saying I have an idea can I + +translate it and it may not be something + +I can capture imagine a kid in + +California in a school saying I want to + +capture how + +the landscape and outside of Paris and + +Francis right now I think they need to + +be able to generate some of the ideas + +that they couldn't capture by physically + +being there and so we're working on all + +of this and we're bringing some of these + +into our products to change what people + +could possibly do through the + +application of AI so they improve + +expressiveness for people + +and so every boundary as the technology + +gets more sophisticated we think it + +moves from just assistance to assistance + +on things that human beings may not have + +been able to just linearly do + +to now things like expressiveness which + +is a very different capability than + +people could actually do themselves + +uh yeah it's an it's I mean all this is + +very is obviously incredibly exciting + +and we're all watching it happen in real + +time you know there's an artist uh who + +actually described the these sort of + +image generation models as he was he + +sort of said like you kind of have to + +think about like a like a camera like + +it's a 
new tool that allows you to + +create fundamentally new uh you know + +forms of art that's right yeah and not + +just one medium of art right because if + +you look in the past people said you + +were a painter you were a sculpture + +you're a musician and now these + +Technologies allow you to blend all of + +it as a form of expressiveness yeah you + +know the the last question I have for + +you is you you know you obviously sit + +down with many of the sort of leading + +CEOs and Business Leaders of many of the + +the sort of largest uh organizations in + +the world and I'm sure one thing that is + +on many of their minds is sort of um as + +AI technology develops and it continues + +to progress is potential disruption that + +might come from from artificial + +intelligence what sort of how do you + +approach that conversation what's sort + +of your advice to these these Business + +Leaders who are looking at this powerful + +new technology and thinking about what + +that might mean for for the businesses + +and and the business landscape + +when we talk to CEOs I mean the biggest + +things we talk to them about number one + +you know uh productivity in the long + +term + +productivity has always been the primary + +driver of improving you know both + +company productivity meaning their own + +companies as well as societal you know + +benefit things like affluence of a + +society Etc and the means and equality + +of distribution of income to people + +across all Spectrum Society eventually + +the most important metric and you can + +look at any economics textbook is + +productivity + +uh software and technology has probably + +been the biggest Boon of productivity + +over the last 30 40 years + +this is the next evolution of that and + +so we always say if you approach it the + +right way for example labor shortages + +are going on right now + +the biggest potential benefit is the + +application of some of these platforms + +like AI to doing that + +the second + 
+with any technological generation + +Revolution like artificial intelligence + +but if you went back in time and looked + +at the Industrial Revolution Etc they're + +always During the period of transition + +anxiety about the consequences of that + +technology + +and it doesn't mean that technology by + +itself is good or bad it's the + +application of the technology that's + +good or bad + +so it's incumbent upon both the + +technology providers and the users of + +the technology to ensure that the + +negative consequences of it are managed + +properly right + +the obvious example is for instance if + +you look at a very simple thing image + +recognition + +image recognition can help doctors find + +tumors way better than having the best + +radiographer + +it's assistive in that context and it's + +like helping people with a better + +quality microscope than they had before + +object recognition is helping people + +find for example people who are in the + +ocean much more accurately so the Coast + +Guard can rescue them + +at the same time being able to use a + +camera and say that's Thomas kurian + +has uh you know a lot of potential + +negative consequences and so as a + +provider of Technology we at Google have + +chosen not to do that third part but we + +also tell companies it's important not + +just to say this is what's regulatory + +Allowed by the legal framework because + +law in many countries is not yet keeping + +up with how fast AI Technologies is + +moving but to take the responsibility as + +a Company CEO to say here's what I'd be + +comfortable with and here's what I won't + +be comfortable with yeah well Thomas + +thank you so much for uh such an + +incredible conversations I think uh I + +think I'm I'm very heartened to hear all + +the incredible work that Google cloud is + +doing to make artificial intelligence + +accessible to you know the entire + +business world and all of every + +Enterprise around the globe and uh I'm + +so excited that 
you're able to join us + +thank you so much thank you so much for + +having me + +[Music] + diff --git a/server/tests/evaluate/reference_texts/ref_sample_2.txt b/server/tests/evaluate/reference_texts/ref_sample_2.txt new file mode 100644 index 00000000..f407f205 --- /dev/null +++ b/server/tests/evaluate/reference_texts/ref_sample_2.txt @@ -0,0 +1,620 @@ +Technologies ticker symbol w-e-l-l on + +the TSX recently reported its 2023 q1 + +results beating the streets consensus + +estimate for revenue and adjusted ebitda + +and in a report issued this week Raymond + +James analyst said quote we're impressed + +by Wells capacity to drive powerful + +growth across its diverse business units + +in the absence of M A joining me today + +is CEO Hamed chabazi to look at what's + +next for well health good to see you sir + +how are you great to see you Richard + +thanks very much for having me great to + +have you uh congratulations on your 17th + +consecutive quarter of record Revenue + +can you share some insights into what's + +Driven these results historically and in + +the past quarter as well + +yeah thank you we we're very excited + +about our uh q1 2023 results and as you + +mentioned uh we've had a long you know + +successful uh string of of uh you know + +continued growth and record growth + +um we also had accelerating organic + +growth and I think um a big part of the + +success of our franchise here is the + +incredibly sticky and predictable + +Revenue that we have you know well over + +90 of our business is either highly + +reoccurring as in uh the you know highly + +predictable uh results of our two-sided + +network of patients and providers or + +truly recurring as in scheduled or + +subscribed revenues and this allows us + +to essentially make sure that that uh + +you know we're on track it obviously you + +know like any other business things + +happen uh and sometimes it's hard to + +meet those results but what's really + +being unique about our platform is 
we do + +have exposure to all kinds of different + +aspects of healthcare you know we have + +Prime primary care and Specialized Care + +on both sides of the Border in the US + +and Canada so we have exposure to + +different types of business models we + +have exposure to the U.S payer Network + +which has higher per unit economics than + +Canada and of course the stability and + +uh and and sort of higher Fidelity uh + +kind of Collections and revenue cycle + +process that Canada has over the United + +States where you don't have to kind of + +deal with all of that uh at that payment + +noise so just a lot of I think strength + +built into the platform because of the + +diversity of different Healthcare + +businesses that we support + +and uh where do you see Well's future + +growth coming from which part of the + +business uh excites you the most right + +now yeah well look the centrifugal force + +of well is the healthcare provider and + +we exist to uh Tech enable and + +ameliorate the business of that of that + +Tech of that healthcare provider uh and + +and and that's what we're laser focused + +on and and what we're seeing is + +providers not wanting to run businesses + +anymore it's very simple and so we have + +a digital platform and providers can + +either acquire what they want and need + +from our digital platform and implement + +it themselves + +or they can decide that they don't want + +to run a business anymore they don't + +want to configure and manage technology + +which is becoming a bigger and bigger + +part of their world every single day and + +when we see what we've seen with that + +Dynamic is that uh is that a lot of them + +are now just wanting to work in a place + +where where all the technology is + +configured for them it's wrapped around + +them and they have a competent operating + +partner that is supporting the organ the + +the practice uh and and taking care of + +the front office in the back office so + +that they can focus on 
providing care + +this results in them seeing more + +patients uh and and being happier + +because you know they became doctors to + +see patients not so they can manage uh + +workers and and deal with HR issues and + +deal with labs and all that kind of + +stuff excellent and I know too that + +Acquisitions have played a key role in + +well can you share any insights into how + +the Acquisitions fit into Wells growth + +strategy + +sure in in look in 2020 and 2021 we did + +a lot of Acquisitions in 2022 we took a + +bit of a breather and we've really + +focused on integration and I think + +that's one of the reasons why you saw + +this accelerating organic growth we + +really were able to demonstrate that we + +could bring together the different + +elements of our technology platform we + +started to sell bundles we started to + +really derive Synergy uh and activate uh + +you know more sales as a result of + +selling uh all the different products + +and services with one voice with One + +Vision uh so we made it easier for + +providers to use their technology and I + +think that was a big reason uh for our + +growth now M A as you know where Capital + +allocation company we're never far from + +it and so we did continue to have you + +know tuck-ins here and there and in fact + +today uh we announced that we've + +acquired uh the Alberta operations of uh + +MCI one Health and other publicly traded + +company uh who was looking to raise + +funds to support their business we're + +very pleased with with this acquisition + +it just demonstrates our continued + +discipline these are you know great + +primary care clinics in in Canada right + +in the greater Calgary area and uh uh + +you know just allows us to grow our + +footprint in Alberta which is an + +important Province for us and it it's + +it's if you look at the price if you + +look at what we're getting uh you know + +it's just demonstrative of our continued + +uh discipline and just you know a few + +days 
ago at our conference call I + +mentioned uh that we had you know a + +really strong lineup of Acquisitions uh + +and you know they're starting to uh uh I + +think uh come to fruition for us + +a company on the grown-up question I you + +recently announced a new AI investment + +program last month what specific areas + +of healthcare technology or AI are you + +focusing on and what's the strategy when + +it comes to AI + +yes uh look AI as as I'm sure you're + +aware is it's become you know really uh + +an incredibly important topic in in all + +aspects of of business and and you know + +not just business socially as well + +everyone's talking about uh this this + +new breakthrough disruptive technology + +the large language models and generative + +AI + +um I mean look AI uh has been about a 80 + +year old overnight success a lot of + +people have been working on this for a + +long time generative AI is just sort of + +you know the culmination of a lot of + +things coming together and working uh + +but it is uncorked enormous uh + +Innovation and and we think that um this + +there's a very good news story about + +this in healthcare particularly where we + +were looking to look we were looking to + +unlock uh the value of of the data that + +that we all produce every single day + +um as as humans and and so we've + +established an AI investment program + +because no one company can can tackle + +all of these Innovations themselves and + +what well has done too is it's taken a + +very much an ecosystem approach by + +establishing its apps.health Marketplace + +and so we're very excited about not only + +uh allocating Capital into promising + +young AI companies that are focused on + +digital health and solving Healthcare + +problems but also giving them access to + +um you know safely and securely to our + +provider Network to our uh you know to + +to our Outpatient Clinic Network which + +is the largest owned and operated + +Network in Canada by far uh so + +um 
and and when these and it's it was + +remarkable when we announced this + +program we've had just in the in the + +first uh week to 10 days we've had over + +a hundred uh inbound prospects come in + +uh that that wanted to you know + +collaborate with us and again I don't + +think that's necessarily for the money + +you know we're saying we would invest a + +minimum of a quarter of a million + +dollars you know a lot of them will + +likely be higher than a quarter of a + +million dollars + +so it's not life-changing money but but + +our structural advantages and and and + +the benefits that we have in the Well + +Network those are extremely hard to come + +by uh and I think and I think uh uh + +you'll see us uh you know help some of + +these companies uh succeed and they will + +help us drive uh you know more + +Innovation to that helps the provider + +but speaking of this very interesting AI + +I know your company just launched well + +AI voice this is super interesting tell + +me what it is and the impact it could + +have on health care providers + +yeah thanks for uh asking Richard our + +providers uh are thrilled with this you + +know we've we've had a number of of of + +our own well providers testing this + +technology and it it it really feels + +like magic to them it's essentially an + +ambient AI powered scribe so it's a it's + +a service that with the consent of the + +parties involved listens to the + +conversation between a patient and + +provider and then uh essentially + +condenses that into a medically relevant + +note for the chart files uh typically + +that is a lengthy process a doctor has + +to transcribe notes then review those + +notes and make sure that uh a a a a + +appropriate medically oriented and + +structured node is is is uh prepared and + +put into the chart and that could take + +you know sometimes more than more time + +than the actual consultation uh time and + +so we believe that on average if it's + +used regularly and consistently 
this can + +give providers back at least a third of + +their day + +um and and it's it's just a game changer + +uh and and uh we have now gone into + +General release with this product it's + +widely available in Canada uh it has + +been integrated into our EMR which makes + +it even more valuable tools like this + +are going to start popping up but if + +they're not integrated into your + +practice management system then you have + +to kind of have data in in more than one + +place and and move that around a little + +bit which which makes it a little bit + +more difficult especially with HIPAA + +requirements and and regulations so + +again I think this is the first of many + +types of different products and services + +that allow doctors to place more + +emphasis and focus on the patient + +experience instead of having their head + +in a laptop and looking at you once in a + +while they'll be looking at you and + +speaking to their practice management + +system and I think this you know think + +about it as Alexa for for our doctors uh + +you know this this ability to speak uh + +and and have you know uh you know Voice + +driven AI assistant that does things + +like this I think are going to be you + +know incredibly helpful and valuable uh + +for for healthcare providers + +super fascinating I mean we're just + +hearing you know more about AI maybe AI + +for the first time but here you are with + +a product already on the market in the + +in the healthcare field that's going to + +be pretty attractive to be out there uh + +right ahead of many other people right + +thank you Richard thanks for that + +recognition that's been Our intention we + +we want to demonstrate that we uh you + +know that we're all in on ensuring that + +technology that benefits providers uh is + +is is accelerated and uh de-risked and + +provided uh you know um in in a timely + +way you know providers need this help we + +we have a healthcare crisis in the + +country that is generally 
characterized + +as a as a lack of doctors and so imagine + +if we can get our doctors to be 20 or 30 + +percent more productive through the use + +of these types of tools well they're + +going to just see more patience and and + +that's going to help all of us and uh + +and look if you step back Wells business + +model is all about having exposure to + +the success of doctors and doing our + +best to help them be more successful + +because we're in a revenue share + +relationship with most of the doctors + +that we work with and so this uh this is + +good for the ecosystem it's great for + +the provider and it's great for well as + +well super fascinating I'm Ed shabazzi + +CEO well Health Technologies ticker + +w-e-l-l great to catch up again thank + +you sir + +thank you Richard appreciate you having + +me + +[Music] + +thank you + diff --git a/server/tests/evaluate/reference_texts/ref_sample_3.txt b/server/tests/evaluate/reference_texts/ref_sample_3.txt new file mode 100644 index 00000000..a589de5f --- /dev/null +++ b/server/tests/evaluate/reference_texts/ref_sample_3.txt @@ -0,0 +1,970 @@ +learning medicine is hard work osmosis + +makes it easy it takes our lectures and + +notes to create a personalized study + +plan with exclusive videos practice + +questions and flashcards and so much + +more try it free today + +in diabetes mellitus your body has + +trouble moving glucose which is the type + +of sugar from your blood into your cells + +this leads to high levels of glucose in + +your blood and not enough of it in your + +cells and remember that your cells need + +glucose as a source of energy so not + +letting the glucose enter means that the + +cells star for energy despite having + +glucose right on their doorstep in + +general the body controls how much + +glucose is in the blood relative to how + +much gets into the cells with two + +hormones insulin and glucagon insulin is + +used to reduce blood glucose levels and + +glucagon is used to increase blood 
+ +glucose levels both of these hormones + +are produced by clusters of cells in the + +pancreas called islets of langerhans + +insulin is secreted by beta cells in the + +center of these islets and glucagon is + +secreted by alpha cells in the periphery + +of the islets insulin reduces the amount + +of glucose in the blood by binding to + +insulin receptors embedded in the cell + +membrane of various insulin responsive + +tissues like muscle cells in adipose + +tissue when activated the insulin + +receptors cause vesicles containing + +glucose transporter that are inside the + +cell to fuse with the cell membrane + +allowing glucose to be transported into + +the cell glucagon does exactly the + +opposite it raises the blood glucose + +levels by getting the liver to generate + +new molecules of glucose from other + +molecules and also break down glycogen + +into glucose so that I can all get + +dumped into the blood diabetes mellitus + +is diagnosed when blood glucose levels + +get too high and this is seen among 10 + +percent of the world population there + +are two types of diabetes type 1 and + +type 2 and the main difference between + +them is the underlying mechanism that + +causes the blood glucose levels to rise + +about 10% of people with diabetes have + +type 1 and the remaining 90% of people + +with diabetes have type 2 let's start + +with type 1 diabetes mellitus sometimes + +just called type 1 diabetes in this + +situation the body doesn't make enough + +insulin the reason this happens is that + +in type 1 diabetes there's a type 4 + +hypersensitivity response or a cell + +mediated immune response where a + +person's own T cells at + +the pancreas as a quick review remember + +that the immune system has T cells that + +react to all sorts of antigens which are + +usually small peptides polysaccharides + +or lipids and that some of these + +antigens are part of our own body cells + +it doesn't make sense to allow T cells + +that will attack our own cells 
to hang + +around until there's this process to + +eliminate them called self tolerance in + +type 1 diabetes there's a genetic + +abnormality that causes a loss of self + +tolerance among T cells that + +specifically target the beta cell + +antigens losing self tolerance means + +that these T cells are allowed to + +recruit other immune cells and + +coordinate an attack on these beta cells + +losing beta cells means less insulin and + +less insulin means that glucose piles up + +in the blood because it can't enter the + +body's cells one really important group + +of genes involved in regulation of the + +immune response is the human leukocyte + +antigen system or HLA system even though + +it's called a system it's basically this + +group of genes on chromosome 6 that + +encode the major histocompatibility + +complex or MHC which is a protein that's + +extremely important in helping the + +immune system recognize foreign + +molecules as well as maintaining self + +tolerance MHC is like the serving + +platter that antigens are presented to + +the immune cells on interestingly people + +with type 1 diabetes often have specific + +HLA genes in common with each other one + +called + +HLA dr3 and another called HLA dr4 but + +this is just a genetic clue right + +because not everyone with HLA dr3 and + +HLA dr4 develops diabetes in diabetes + +mellitus type 1 destruction of beta + +cells usually starts early in life but + +sometimes up to 90% of the beta cells + +are destroyed before symptoms crop up + +for clinical symptoms of uncontrolled + +diabetes that all sound similar our + +polyphagia glycosuria polyuria and + +polydipsia let's go through them one by + +one even though there's a lot of glucose + +in the blood it cannot get into the + +cells which leaves cells starved for + +energy so in response adipose tissue + +starts breaking down fat called + +lipolysis + +and muscle tissue starts breaking down + +proteins both of which results in weight + +loss for someone 
with uncontrolled + +diabetes this catabolic state leaves + +people feeling hungry + +also known as poly fascia Faiza means + +eating and poly means a lot now with + +high glucose levels that means that when + +blood gets filtered through the kidneys + +some of it starts to spill into the + +urine called glycosuria glyco surfers to + +glucose and urea the urine since glucose + +is osmotically active water tends to + +follow it resulting in an increase in + +urination or polyuria poly again refers + +to a lot and urea again refers to urine + +finally because there's so much + +urination people with uncontrolled + +diabetes become dehydrated and thirsty + +or polydipsia poly means a lot and dip + +SIA means thirst even though people with + +diabetes are not able to produce their + +own insulin they can still respond to + +insulin so treatment involves lifelong + +insulin therapy to regulate their blood + +glucose levels and basically enable + +their cells to use glucose + +one really serious complication with + +type 1 diabetes is called diabetic + +ketoacidosis or DKA to understand it + +let's go back to the process of + +lipolysis where fat is broken down into + +free fatty acids after that happens the + +liver turns the fatty acids into ketone + +bodies like Osito acetic acid in beta + +hydroxy butyrate acid a seed of acetic + +acid is a keto acid because it has a + +ketone group in a carboxylic acid group + +beta hydroxy rhetoric acid on the other + +hand even though it's still one of the + +ketone bodies isn't technically a keto + +acid since its ketone group has been + +reduced to a hydroxyl group these ketone + +bodies are important because they can be + +used by cells for energy but they also + +increase the acidity of the blood which + +is why it's called ketoacidosis and the + +blood becoming really acidic can have + +major effects throughout the body + +individuals can develop custom all + +respiration which is a deep and labored + +breathing as the body 
tries to move + +carbon dioxide out of the blood in an + +effort to reduce its acidity cells also + +have a transporter that exchanges + +hydrogen ions or protons for potassium + +when the blood gets acidic it's by + +definition loaded with protons that get + +sent into cells while potassium gets + +sent into the fluid outside cells + +another thing to keep in mind is that in + +addition to helping glucose enter cells + +insulin stimulates the sodium potassium + +ATPase --is which help potassium get + +into the cells and so without insulin + +more potassium stays in the fluid + +outside cells both of these mechanisms + +lead to increased potassium in the fluid + +outside cells which quickly makes it + +into the blood and causes hyperkalemia + +the potassium is then excreted so over + +time even though the blood potassium + +levels remain high over all stores of + +potassium in the body which include + +potassium inside cells starts to run low + +individuals will also have a high anion + +gap which reflects a large difference in + +the unmeasured negative and positive + +ions in the serum largely due to the + +build-up of ketoacids + +diabetic ketoacidosis can happen even in + +people who have already been diagnosed + +with diabetes and currently have some + +sort of insulin therapy + +in states of stress like an infection + +the body releases epinephrine which in + +turn stimulates the release of glucagon + +too much glucagon can tip the delicate + +hormonal balance of glucagon and insulin + +in favor of elevating blood sugars and + +can lead to a cascade of events we just + +described increased glucose in the blood + +loss of glucose in the urine loss of + +water dehydration and in parallel and + +need for alternative energy generation + +of ketone bodies and ketoacidosis + +interestingly both ketone bodies break + +down into acetone and escape as a gas by + +getting breathed out the lungs which + +gives us sweet fruity smell to a + +person's breath in general 
though that's + +the only sweet thing about this illness + +which also causes nausea vomiting and if + +severe mental status changes and acute + +cerebral edema + +treatment of a DKA episode involves + +giving plenty of fluids which helps with + +dehydration insulin which helps lower + +blood glucose levels and replacement of + +electrolytes like potassium all of which + +help to reverse the acidosis now let's + +switch gears and talk about type 2 + +diabetes which is where the body makes + +insulin but the tissues don't respond as + +well to it the exact reason why cells + +don't respond isn't fully understood + +essentially the body's providing the + +normal amount of insulin but the cells + +don't move their glucose transporters to + +their membrane in response which + +remember is needed for the glucose to + +get into the cells these cells therefore + +have insulin resistance some risk + +factors for insulin resistance are + +obesity lack of exercise and + +hypertension the exact mechanisms are + +still being explored for example in + +excess of adipose tissue or fat is + +thought to cause the release of free + +fatty acids in so-called edible kinds + +which are signaling molecules that can + +cause inflammation which seems related + +to insulin resistance + +however many people that are obese are + +not diabetic so genetic factors probably + +play a major role as well we see this + +when we look at twin studies as well + +we're having a twin with type-2 diabetes + +increases the risk of developing type 2 + +diabetes completely independently of + +other environmental risk factors in type + +2 diabetes since tissues don't respond + +as well to normal levels of insulin the + +body ends up producing more insulin in + +order to get the same effect and move + +glucose out of the blood + +they do this through beta cell + +hyperplasia an increased number of beta + +cells and beta cell hypertrophy where + +they actually grow in size all in this + +attempt to pump out 
more insulin this + +works for a while and by keeping insulin + +levels higher than normal blood glucose + +levels can be kept normal called normal + +glycemia now along with insulin beta + +cells also secrete islet amyloid + +polypeptide or amylin so while beta + +cells are cranking out insulin they also + +secrete an increased amount of amylin + +over time Emlyn builds up and aggregates + +in the islets this beta cell + +compensation though is not sustainable + +and over time those maxed out beta cells + +get exhausted and they become + +dysfunctional and undergo hypo trophy + +and get smaller as well as hypoplasia + +and die off as beta cells are lost in + +insulin levels decrease glucose levels + +in the blood start to increase in + +patients develop hyperglycemia which + +leads to similar clinical signs that we + +mentioned before like Paul aphasia + +glycosuria polyuria polydipsia but + +unlike type 1 diabetes there's generally + +some circulating insulin in type 2 + +diabetes from the beta cells that are + +trying to compensate for the insulin + +resistance this means that the insulin + +glucagon balances such that diabetic + +ketoacidosis does not usually develop + +having said that a complication called + +hyperosmolar hyperglycemic state or HHS + +is much more common in type 2 diabetes + +than type 1 diabetes and it causes + +increased plasma osmolarity due to + +extreme dehydration and concentration of + +the blood to help understand this + +remember that glucose is a polar + +molecule that cannot passively diffuse + +across cell membranes which means that + +it acts as a solute so when levels of + +glucose are super high in the blood + +meaning it's a hyperosmolar State water + +starts to leave the body cells and enter + +the blood vessels leaving the cells were + +relatively dry in travailed rather than + +plump and juicy blood vessels that are + +full of water lead to increased + +urination and total body dehydration and + +this is a very serious 
situation because + +the dehydration of the body's cells and + +in particular the brain can cause a + +number of symptoms including mental + +status changes in HHS you can sometimes + +see mild ketone emia and acidosis but + +not to the extent that it's seen in DKA + +and in DKA you can see some hyper + +osmolarity so there's definitely overlap + +between these two syndromes + +besides type 1 and type 2 diabetes there + +are also a couple other subtypes of + +diabetes mellitus gestational diabetes + +is when pregnant women have increased + +blood glucose which is particularly + +during the third trimester although + +ultimately unknown the cause is thought + +to be related to pregnancy hormones that + +interfere with insulins action on + +insulin receptors also sometimes people + +can develop drug-induced diabetes which + +is where medications have side effects + +that tend to increase blood glucose + +levels the mechanism for both of these + +is thought to be related to insulin + +resistance like type 2 diabetes rather + +than an autoimmune destruction process + +like in type 1 diabetes diagnosing type + +1 or type 2 diabetes is done by getting + +a sense for how much glucose is floating + +around in the blood and has specific + +standards that the World Health + +Organization uses very commonly a + +fasting glucose test is taken where the + +person doesn't eat or drink except the + +water that's okay for a total of eight + +hours and then has their blood tested + +for glucose levels levels of 100 + +milligrams per deciliter to 120 + +five milligrams per deciliter indicates + +pre-diabetes and 126 milligrams per + +deciliter or higher indicates diabetes a + +non fasting a random glucose test can be + +done at any time with 200 milligrams per + +deciliter or higher being a red flag for + +diabetes another test is called an oral + +glucose tolerance test where person is + +given glucose and then blood samples are + +taken at time intervals to figure out + +how well 
it's being cleared from the + +blood the most important interval being + +two hours later levels of 140 milligrams + +per deciliter to 199 milligrams per + +deciliter indicate pre-diabetes + +and 200 or above indicates diabetes + +another thing to know is that when blood + +glucose levels get high the glucose can + +also stick to proteins that are floating + +around in the blood or in cells so that + +brings us to another type of test that + +can be done which is the hba1c test + +which tests for the proportion of + +hemoglobin in red blood cells that has + +glucose stuck to it called glycated + +hemoglobin hba1c levels of 5.7% 26.4% + +indicate pre-diabetes + +and 6.5 percent or higher indicates + +diabetes this proportion of glycated + +hemoglobin doesn't change day to day so + +it gives a sense for whether the blood + +glucose levels have been high over the + +past two to three months finally we have + +the c-peptide test which tests for + +byproducts of insulin production if the + +level of c-peptide is low or absent it + +means the pancreas is no longer + +producing enough insulin and the glucose + +cannot enter the cells + +for type one diabetes insulin is the + +only treatment option for type 2 + +diabetes on the other hand lifestyle + +changes like weight loss and exercise + +along with a healthy diet and an oral + +anti-diabetic medication like metformin + +in several other classes can sometimes + +be enough to reverse some of that + +insulin resistance and keep blood sugar + +levels in check however if oral + +anti-diabetic medications fail type 2 + +diabetes can also be treated with + +insulin something to bear in mind is + +that insulin treatment comes with a risk + +of hypoglycemia especially if insulin is + +taken without a meal symptoms of + +hypoglycemia can be mild like weakness + +hunger and shaking but they can progress + +to a loss of consciousness in seizures + +in severe cases in mild cases drinking + +juices or eating candy or sugar might be 
+ +enough to bring blood sugar up but in + +severe cases intravenous glucose should + +be given as soon as possible + +the FDA has also recently approved + +intranasal glucagon as a treatment for + +severe hypoglycemia all right now over + +time high glucose levels can cause + +damage to tiny blood vessels while the + +micro vasculature in arterioles a + +process called hyaline + +arteriolosclerosis is where the walls of + +the arterioles develop hyaline deposits + +which are deposits of proteins and these + +make them hard and inflexible in + +capillaries the basement membrane can + +thicken and make it difficult for oxygen + +to easily move from the capillary to the + +tissues causing hypoxia + +one of the most significant effects is + +that diabetes increases the risk of + +medium and large arterial wall damage + +and subsequent atherosclerosis which can + +lead to heart attacks and strokes which + +are major causes of morbidity and + +mortality for patients with diabetes in + +the eyes diabetes can lead to + +retinopathy and evidence of that can be + +seen on a fundus copic exam that shows + +cotton-wool spots or flare hemorrhages + +and can eventually cause blindness in + +the kidneys the a ferrant and efferent + +arterioles as well as the glomerulus + +itself can get damaged which can lead to + +an F Radek syndrome that slowly + +diminishes the kidneys ability to filter + +blood over time and can ultimately lead + +to dialysis diabetes can also affect the + +function of nerves causing symptoms like + +a decrease in sensation in the toes and + +fingers sometimes called a stocking + +glove distribution as well as causes the + +autonomic nervous system to malfunction + +and that system controls a number of + +body functions + +everything from sweating to passing gas + +finally both the poor blood supply and + +nerve damage can lead to ulcers + +typically on the feet that don't heal + +quickly and can get pretty severe and + +need to be amputated these are some 
of + +the complications of uncontrolled + +diabetes which is why it's important to + +diagnose and control diabetes through a + +healthy lifestyle medications to reduce + +insulin resistance and even insulin + +therapy if beta cells have been + +exhausted while type 1 diabetes cannot + +be prevented type 2 diabetes can in fact + +many people with diabetes can control + +their blood sugar levels really + +effectively and live a full and active + +life without any of the complications + +thanks for watching if you're interested + +in a deeper dive on this topic take a + +look at as Moses org where we have + +flashcards questions and other awesome + +tools to help you learn medicine + +you + From 7ee049bad64360a1f1337cb785b7b981a5745287 Mon Sep 17 00:00:00 2001 From: Gokul Mohanarangan Date: Thu, 10 Aug 2023 12:09:52 +0530 Subject: [PATCH 2/6] update deps --- server/poetry.lock | 238 +++++++++++++++++++++++++++++++++++++++++- server/pyproject.toml | 3 + 2 files changed, 237 insertions(+), 4 deletions(-) diff --git a/server/poetry.lock b/server/poetry.lock index b0ded524..b99ae6a0 100644 --- a/server/poetry.lock +++ b/server/poetry.lock @@ -1337,6 +1337,21 @@ files = [ {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, ] +[[package]] +name = "jiwer" +version = "3.0.2" +description = "Evaluate your speech-to-text system with similarity measures such as word error rate (WER)" +optional = false +python-versions = ">=3.7,<4.0" +files = [ + {file = "jiwer-3.0.2-py3-none-any.whl", hash = "sha256:cbf5872e0431942847765e444b338cd74ed9a96682532da87e46575010b76fd2"}, + {file = "jiwer-3.0.2.tar.gz", hash = "sha256:52bb2a3dba0589e85e6598c40c413e1f2a6a2f7f7c57ce9f7b2c094039618235"}, +] + +[package.dependencies] +click = ">=8.1.3,<9.0.0" +rapidfuzz = "2.13.7" + [[package]] name = "jmespath" version = "1.0.1" @@ -1348,6 +1363,120 @@ files = [ {file = "jmespath-1.0.1.tar.gz", hash = 
"sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe"}, ] +[[package]] +name = "levenshtein" +version = "0.21.1" +description = "Python extension for computing string edit distances and similarities." +optional = false +python-versions = ">=3.6" +files = [ + {file = "Levenshtein-0.21.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:59e5054c9dea821840af4623a4059c8f0ae56548a5eae8b9c7aaa0b3f1e33340"}, + {file = "Levenshtein-0.21.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:11694c6f7119d68cc199ff3b1407560c0efb0cc49f288169f28b2e032ee03cda"}, + {file = "Levenshtein-0.21.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f5f7ce639bea0f5e95a1f71963624b85521a39928a2a1bb0e66f6180facf5969"}, + {file = "Levenshtein-0.21.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:39e8a1866325b6d54de4e7d1bffffaf4b4c8cbf0988f47f0f2e929edfbeb870d"}, + {file = "Levenshtein-0.21.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ed73d619e203aad54e2e6119a2b58b7568a36bd50a547817d13618ea0acf4412"}, + {file = "Levenshtein-0.21.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:50fbe01be99554f644657c32a9e3085369d23e8ccc540d855c683947d3b48b67"}, + {file = "Levenshtein-0.21.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:675ba3afaa9e8ec393eb1eeee651697036e8391be54e6c28eae4bfdff4d5e64e"}, + {file = "Levenshtein-0.21.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c89a5ac319a80c131ca8d499ae0f7a91d4dd1dc3b2e9d8b095e991597b79c8f9"}, + {file = "Levenshtein-0.21.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:f9e3a5f4386c8f1811153f309a0ba3dc47d17e81a6dd29aa22d3e10212a2fd73"}, + {file = "Levenshtein-0.21.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:ea042ba262ea2a95d93c4d2d5879df956cf6c85ce22c037e3f0d4491182f10c5"}, + {file = 
"Levenshtein-0.21.1-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:622bc670b906c4bf219755625e9fa704ff07c561a90f1aa35f3f2d8ecd3ec088"}, + {file = "Levenshtein-0.21.1-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:f0e51ff6d5665884b0e39b4ae0ef4e2d2d0174147147db7a870ddc4123882212"}, + {file = "Levenshtein-0.21.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:cc8eb12c48598b20b4b99128bc2bd62974dfb65204ceb37807480788b1e66e64"}, + {file = "Levenshtein-0.21.1-cp310-cp310-win32.whl", hash = "sha256:04d338c9153ddf70a32f324cf9f902fe94a6da82122b8037ccde969d4cc0a94b"}, + {file = "Levenshtein-0.21.1-cp310-cp310-win_amd64.whl", hash = "sha256:5a10fc3be2bfb05b03b868d462941e4099b680b7f358a90b8c6d7d5946e9e97c"}, + {file = "Levenshtein-0.21.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:938581ba87b306675bc41e21c2b2822a9eb83fb1a0e4a4903b7398d7845b22e3"}, + {file = "Levenshtein-0.21.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:06da6c47aa459c725ee90dab467cd2f66956c5f9a43ddb51a0fe2496960f1d3e"}, + {file = "Levenshtein-0.21.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:eea308d98c64dbea48ac351011c4adf66acd936c4de2bf9955826ba8435197e2"}, + {file = "Levenshtein-0.21.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a51974fcb8a94284325cb88b474b76227532a25b035938a46167bebd1646718e"}, + {file = "Levenshtein-0.21.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:87edb05fc6e4eb14008433f02e89815a756fe4ecc32d7180bb757f26e4161e06"}, + {file = "Levenshtein-0.21.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:aee4f570652ad77961e5ab871d11fd42752e7d2117b08324a0c8801a7ee0a7c5"}, + {file = "Levenshtein-0.21.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:43a06b0b492e0d936deff751ad4757786ba7cb5eee510d53b6dfe92c924ff733"}, + {file = "Levenshtein-0.21.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", 
hash = "sha256:952e72f173a65f271dfee102b5571004b6594d4f199864ddead77115a2c147fd"}, + {file = "Levenshtein-0.21.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:d3f855669e1399597f7a2670310cf20fc04a35c6c446dd70320398e9aa481b3d"}, + {file = "Levenshtein-0.21.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:ca992783feaf1d6e25403340157fb584cf71371b094a575134393bba10b974fa"}, + {file = "Levenshtein-0.21.1-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:20361f42f6e7efa5853f69a41a272e9ecb90da284bec4312e42b58fa42b9a752"}, + {file = "Levenshtein-0.21.1-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:9bcb3abbe97975cc6a97baf24a3b6e0491472ecedbc0247a41eb2c8d73ecde5d"}, + {file = "Levenshtein-0.21.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:72b0b84adc52f4cf970a1bb276e76e115b30d693d6dbcd25fca0bcee85ca7cc7"}, + {file = "Levenshtein-0.21.1-cp311-cp311-win32.whl", hash = "sha256:4217ae380f42f825862eb8e2f9beca627fe9ab613f36e206842c003bb1affafc"}, + {file = "Levenshtein-0.21.1-cp311-cp311-win_amd64.whl", hash = "sha256:12bb3540e021c73c5d8796ecf8148afd441c4471731924a112bc31bc25abeabf"}, + {file = "Levenshtein-0.21.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:a0fa251b3b4c561d2f650d9a61fb8980815492bb088a0a521236995a1872e171"}, + {file = "Levenshtein-0.21.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d4bf11b89d8d7a7707ae5cac1ef86ac4ff78491482df037289470db8f0378043"}, + {file = "Levenshtein-0.21.1-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:91dca7085aa358da71fa50682fc8ff7e21365c99ef17dc1962a7bbf488003528"}, + {file = "Levenshtein-0.21.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f4f187f0929a35b6ddabc1324161e8c73ddbd4a7747249f10ec9ceaa793e904f"}, + {file = "Levenshtein-0.21.1-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0d997da10fdf1a82e208fd1b05aba40705ca3f053919c84d2e952141d33e3ab3"}, + {file = 
"Levenshtein-0.21.1-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6ed8f99e4e4ba8a43bb4fe0255606724f22069405fa1e3be679a2d90f74770e5"}, + {file = "Levenshtein-0.21.1-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:5acb7e84ccd619dcff6e04928fa8d8cc24f55bb2c9cdfe96620ed85b0a82a7c7"}, + {file = "Levenshtein-0.21.1-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:62dca15301bdba4ec7fcf53c39dd8d9c198194990cf035def3f47b7cb9c3213e"}, + {file = "Levenshtein-0.21.1-cp36-cp36m-musllinux_1_1_ppc64le.whl", hash = "sha256:832951ad7b5ee0df8152f239a9fc602322da055264459dcf4d50d3ed68e68045"}, + {file = "Levenshtein-0.21.1-cp36-cp36m-musllinux_1_1_s390x.whl", hash = "sha256:e8ab4d5acdd3ac17161539d9f2ea764497dc269dcd8dc722ae4a394c7b64ae7f"}, + {file = "Levenshtein-0.21.1-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:3c13450450d537ec7ede3781be72d72db37cb131943148c8ada58b34e143fc6f"}, + {file = "Levenshtein-0.21.1-cp36-cp36m-win32.whl", hash = "sha256:267ad98befffeed90e73b8c644a297027adb81f61044843aeade7b4a44ccc7d7"}, + {file = "Levenshtein-0.21.1-cp36-cp36m-win_amd64.whl", hash = "sha256:d66d8f3ebde14840a310a557c8f69eed3e153f2477747365355d058208eea515"}, + {file = "Levenshtein-0.21.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:78d0fb5faef0413864c1b593e5261a840eaa47842b0fa4af7be4c09d90b24a14"}, + {file = "Levenshtein-0.21.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9dda976c1dae2a0b41a109facc48d1d242c7acb30ab4c04d8421496da6e153aa"}, + {file = "Levenshtein-0.21.1-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1dc54aeb02f38a36f16bca6b0f9d07462686d92716424d9a4a3fdd11f3624528"}, + {file = "Levenshtein-0.21.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:463fd7558f25c477c7e4a59af35c661e133473f62bb02ed2c07c9c95e1c2dc66"}, + {file = "Levenshtein-0.21.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:f00495a80c5850466f0a57ea874761f78079702e28b63a1b6573ad254f828e44"}, + {file = "Levenshtein-0.21.1-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:31aa08e8ddac402edd530aaf708ab085fea7299c499404989eabfde143377911"}, + {file = "Levenshtein-0.21.1-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:9e96217a7c6a7d43071c830b1353a3ee669757ae477673f0fd3e3a97def6d410"}, + {file = "Levenshtein-0.21.1-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:d83b8c0ce41e410af143bd3abef94e480d143fdb83e60a01bab9069bf565dada"}, + {file = "Levenshtein-0.21.1-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:309f134f3d42fa7df7efbbd7975f2331de8c36da3ebdb3fad59abae84268abba"}, + {file = "Levenshtein-0.21.1-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:267bc6725506571fd3c03afcc871fa5cbf3d2cb6e4bd11043790fa60cbb0f8a4"}, + {file = "Levenshtein-0.21.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:4a6cd85ac5f7800e8127b3194fa02c59be735b6bdfe55b8516d094652235e038"}, + {file = "Levenshtein-0.21.1-cp37-cp37m-win32.whl", hash = "sha256:13e87517ce788d71deaa73e37332a67c4085c13e58ea3a0218092d555d1872ce"}, + {file = "Levenshtein-0.21.1-cp37-cp37m-win_amd64.whl", hash = "sha256:918f2e0f590cacb30edb88e7eccbf71b340d5f080c9e69009f1f00dc24810a67"}, + {file = "Levenshtein-0.21.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:d17c2ee8aa380c012b3ba015b87502934662c51b7609ef17366c76863e9551d6"}, + {file = "Levenshtein-0.21.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ee847d3e49870e914074fd31c069a1aaba6f71bee650d41de48e7e4b11671bf0"}, + {file = "Levenshtein-0.21.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:8d01425bd54c482ccbbc6d953633450a2bdbb7d12450d9eeba6073a6d0f06a3c"}, + {file = "Levenshtein-0.21.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bff4f236d1b6c556a77975812a4d51071181721f3a29c08b42e5c4aa11730957"}, + {file = 
"Levenshtein-0.21.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:35a603d952e9f286fe8053332862c8cff426f5d8a85ee962c3a0f597f4c463c4"}, + {file = "Levenshtein-0.21.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9546ded45fb3cf8773ade9c91de164c6cb2cb4927516289abd422a262e81906c"}, + {file = "Levenshtein-0.21.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:79259b10f105f78853210d8769cf77ca55dac8c368dca33b4c10ffa8965e2543"}, + {file = "Levenshtein-0.21.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:41e0e539638a27b5e90a5d46679375f93a1cb65cf06efe7c413cf76f71d3d467"}, + {file = "Levenshtein-0.21.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:ccd0b89300a25decdb34d7c4efe2a971438015f552eeb416b8da12918cb3edc0"}, + {file = "Levenshtein-0.21.1-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:ef365ec78938597623d4fb96c8b0db423ab484fcfc00fae44c34b738b1eb1924"}, + {file = "Levenshtein-0.21.1-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:e701b9dfb121faf71b0c5757485fc49e1b511b7b8a80034aa1f580488f8f872e"}, + {file = "Levenshtein-0.21.1-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:e4c2fe1f49f1d8476fe44e243569d775c5454dca70a13be568430d2d2d760ea2"}, + {file = "Levenshtein-0.21.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:40a5e38d0c3e488d1dca5dc9c2691c000764813d4006c243f2ebd39e0b331e95"}, + {file = "Levenshtein-0.21.1-cp38-cp38-win32.whl", hash = "sha256:6c08879d0cf761cd750e976fda67bcc23cf1e485eaa030942e6628b876f4c6d8"}, + {file = "Levenshtein-0.21.1-cp38-cp38-win_amd64.whl", hash = "sha256:248348e94dee05c787b44f16533a366ec5bf8ba949c604ad0db69d0c872f3539"}, + {file = "Levenshtein-0.21.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:3824e9f75ec9f373fc8b4df23eae668918953487f5ff06db282ddcb3f9c802d2"}, + {file = "Levenshtein-0.21.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = 
"sha256:2e2ed817fa682243ef2e8a2728fcd0f9352d4e5edd104db44862d0bb55c75a7e"}, + {file = "Levenshtein-0.21.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:94a6ffd7257d12c64de34bc9f801a211e2daa624ec276305f8c67963a9896efa"}, + {file = "Levenshtein-0.21.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6833f8cefb96b8ccac457ad421866a74f4de973e7001699fcbbbe9ccb59a5c66"}, + {file = "Levenshtein-0.21.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c8126d2b51621483823c6e31d16bc1f5a964ae976aab4f241bbe74ed19d93770"}, + {file = "Levenshtein-0.21.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:58eaab403b77e62e096cbcbaf61728c8736f9f7a3e36a58fb663461e5d70144f"}, + {file = "Levenshtein-0.21.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:47e6d66fe0110fd8e6efb1939d686099170c27b3ca838eab0c215f0781f05f06"}, + {file = "Levenshtein-0.21.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f5a1f28b34a15dd2d67bcc324f6661df8cfe66d6ec7ee7a64e921af8ae4c39b7"}, + {file = "Levenshtein-0.21.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:c37609f4e460e570810ec5176c5cdf91c494a9979638f7fef5fd345597245d17"}, + {file = "Levenshtein-0.21.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:656c70814280c4002af89112f1457b6ad24c42dfba58dcb2047a249ae8ccdd04"}, + {file = "Levenshtein-0.21.1-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:863d507cba67de2fa66d5501ed1bc5029363d2b393662ac7d740dd0330c66aba"}, + {file = "Levenshtein-0.21.1-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:9437c2342937decf3cf5ac79d0b9497734897c0a09dc813378c97f2916b7aa76"}, + {file = "Levenshtein-0.21.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:a1cd48db3d03adb88bf71b45de77b9720f96d3b9d5ab7a32304352baec482689"}, + {file = "Levenshtein-0.21.1-cp39-cp39-win32.whl", hash = "sha256:023dffdde576639e48cab3cc835bfaf9c441df7a8e2829bf20104868db6e4f72"}, 
+ {file = "Levenshtein-0.21.1-cp39-cp39-win_amd64.whl", hash = "sha256:dcc712696d4332962ecab6e4df40d5126d7379c6612e6058ee2e9d3f924387e3"}, + {file = "Levenshtein-0.21.1-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:9a8d60084e1c9e87ae247c601e331708de09ed23219b5e39af7c8e9115ab8152"}, + {file = "Levenshtein-0.21.1-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ffa6762f8ef1e7dfba101babe43de6edc541cbe64d33d816314ac67cd76c3979"}, + {file = "Levenshtein-0.21.1-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eec8a1eaaeadc217c15bc77d01bb29e146acdae73a0b2e9df1ad162263c9752e"}, + {file = "Levenshtein-0.21.1-pp37-pypy37_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5da0e2dbddb98da890fb779823df991ad50f184b3d986b8c68784eecbb087f01"}, + {file = "Levenshtein-0.21.1-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:edac6490f84e8a0456cb40f6729d4199311ce50ca0ea4958572e1b7ea99f546c"}, + {file = "Levenshtein-0.21.1-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:b33e2cbaca6f7d01092a28711605568dbc08a3bb7b796d8986bf5d0d651a0b09"}, + {file = "Levenshtein-0.21.1-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69a430ab564d286f309c19f7abed34fce9c144f39f984c609ee690dd175cc421"}, + {file = "Levenshtein-0.21.1-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f30474b2906301594c8fb64cb7492c6978290c466a717c4b5316887a18b77af5"}, + {file = "Levenshtein-0.21.1-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9817dca597abde9fc9571d56a7eca8bd667e9dfc0867b190f1e8b43ce4fde761"}, + {file = "Levenshtein-0.21.1-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:7d7e00e8cb45981386df9d3f99073ba7de59bdb739069766b32906421bb1026b"}, + {file = "Levenshtein-0.21.1-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = 
"sha256:c9a072cb0f6e90092c4323cd7731eb539a79ac360045dbe3cc49a123ba381fc5"}, + {file = "Levenshtein-0.21.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2d880a87aca186342bc2fe16b064c3ed434d2a0c170c419f23b4e00261a5340a"}, + {file = "Levenshtein-0.21.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f282711a220d1bdf245da508e1fefdf7680d1f7482a094e37465674a7e6985ae"}, + {file = "Levenshtein-0.21.1-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cdba9f8a7a98b0c4c0bc004b811fb31a96521cd264aeb5375898478e7703de4d"}, + {file = "Levenshtein-0.21.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:b2410469cc8fd0f42aa00e63063c42f8aff501996cd5424a5c904739bdaaf4fe"}, + {file = "Levenshtein-0.21.1.tar.gz", hash = "sha256:2e4fc4522f9bf73c6ab4cedec834783999b247312ec9e3d1435a5424ad5bc908"}, +] + +[package.dependencies] +rapidfuzz = ">=2.3.0,<4.0.0" + [[package]] name = "loguru" version = "0.7.0" @@ -2085,6 +2214,107 @@ files = [ {file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"}, ] +[[package]] +name = "rapidfuzz" +version = "2.13.7" +description = "rapid fuzzy string matching" +optional = false +python-versions = ">=3.7" +files = [ + {file = "rapidfuzz-2.13.7-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:b75dd0928ce8e216f88660ab3d5c5ffe990f4dd682fd1709dba29d5dafdde6de"}, + {file = "rapidfuzz-2.13.7-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:24d3fea10680d085fd0a4d76e581bfb2b1074e66e78fd5964d4559e1fcd2a2d4"}, + {file = "rapidfuzz-2.13.7-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8109e0324d21993d5b2d111742bf5958f3516bf8c59f297c5d1cc25a2342eb66"}, + {file = "rapidfuzz-2.13.7-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b5f705652360d520c2de52bee11100c92f59b3e3daca308ebb150cbc58aecdad"}, + {file = 
"rapidfuzz-2.13.7-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7496e8779905b02abc0ab4ba2a848e802ab99a6e20756ffc967a0de4900bd3da"}, + {file = "rapidfuzz-2.13.7-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:24eb6b843492bdc63c79ee4b2f104059b7a2201fef17f25177f585d3be03405a"}, + {file = "rapidfuzz-2.13.7-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:467c1505362823a5af12b10234cb1c4771ccf124c00e3fc9a43696512bd52293"}, + {file = "rapidfuzz-2.13.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:53dcae85956853b787c27c1cb06f18bb450e22cf57a4ad3444cf03b8ff31724a"}, + {file = "rapidfuzz-2.13.7-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:46b9b8aa09998bc48dd800854e8d9b74bc534d7922c1d6e1bbf783e7fa6ac29c"}, + {file = "rapidfuzz-2.13.7-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:1fbad8fb28d98980f5bff33c7842efef0315d42f0cd59082108482a7e6b61410"}, + {file = "rapidfuzz-2.13.7-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:43fb8cb030f888c3f076d40d428ed5eb4331f5dd6cf1796cfa39c67bf0f0fc1e"}, + {file = "rapidfuzz-2.13.7-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:b6bad92de071cbffa2acd4239c1779f66851b60ffbbda0e4f4e8a2e9b17e7eef"}, + {file = "rapidfuzz-2.13.7-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:d00df2e4a81ffa56a6b1ec4d2bc29afdcb7f565e0b8cd3092fece2290c4c7a79"}, + {file = "rapidfuzz-2.13.7-cp310-cp310-win32.whl", hash = "sha256:2c836f0f2d33d4614c3fbaf9a1eb5407c0fe23f8876f47fd15b90f78daa64c34"}, + {file = "rapidfuzz-2.13.7-cp310-cp310-win_amd64.whl", hash = "sha256:c36fd260084bb636b9400bb92016c6bd81fd80e59ed47f2466f85eda1fc9f782"}, + {file = "rapidfuzz-2.13.7-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:b34e8c0e492949ecdd5da46a1cfc856a342e2f0389b379b1a45a3cdcd3176a6e"}, + {file = "rapidfuzz-2.13.7-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:875d51b3497439a72e2d76183e1cb5468f3f979ab2ddfc1d1f7dde3b1ecfb42f"}, + 
{file = "rapidfuzz-2.13.7-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ae33a72336059213996fe4baca4e0e4860913905c2efb7c991eab33b95a98a0a"}, + {file = "rapidfuzz-2.13.7-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5585189b3d90d81ccd62d4f18530d5ac8972021f0aaaa1ffc6af387ff1dce75"}, + {file = "rapidfuzz-2.13.7-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:42085d4b154a8232767de8296ac39c8af5bccee6b823b0507de35f51c9cbc2d7"}, + {file = "rapidfuzz-2.13.7-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:585206112c294e335d84de5d5f179c0f932837752d7420e3de21db7fdc476278"}, + {file = "rapidfuzz-2.13.7-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f891b98f8bc6c9d521785816085e9657212621e93f223917fb8e32f318b2957e"}, + {file = "rapidfuzz-2.13.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:08590905a95ccfa43f4df353dcc5d28c15d70664299c64abcad8721d89adce4f"}, + {file = "rapidfuzz-2.13.7-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:b5dd713a1734574c2850c566ac4286594bacbc2d60b9170b795bee4b68656625"}, + {file = "rapidfuzz-2.13.7-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:988f8f6abfba7ee79449f8b50687c174733b079521c3cc121d65ad2d38831846"}, + {file = "rapidfuzz-2.13.7-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:b3210869161a864f3831635bb13d24f4708c0aa7208ef5baac1ac4d46e9b4208"}, + {file = "rapidfuzz-2.13.7-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:f6fe570e20e293eb50491ae14ddeef71a6a7e5f59d7e791393ffa99b13f1f8c2"}, + {file = "rapidfuzz-2.13.7-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:6120f2995f5154057454c5de99d86b4ef3b38397899b5da1265467e8980b2f60"}, + {file = "rapidfuzz-2.13.7-cp311-cp311-win32.whl", hash = "sha256:b20141fa6cee041917801de0bab503447196d372d4c7ee9a03721b0a8edf5337"}, + {file = "rapidfuzz-2.13.7-cp311-cp311-win_amd64.whl", hash = 
"sha256:ec55a81ac2b0f41b8d6fb29aad16e55417036c7563bad5568686931aa4ff08f7"}, + {file = "rapidfuzz-2.13.7-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:7d005e058d86f2a968a8d28ca6f2052fab1f124a39035aa0523261d6baf21e1f"}, + {file = "rapidfuzz-2.13.7-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fe59a0c21a032024edb0c8e43f5dee5623fef0b65a1e3c1281836d9ce199af3b"}, + {file = "rapidfuzz-2.13.7-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cdfc04f7647c29fb48da7a04082c34cdb16f878d3c6d098d62d5715c0ad3000c"}, + {file = "rapidfuzz-2.13.7-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:68a89bb06d5a331511961f4d3fa7606f8e21237467ba9997cae6f67a1c2c2b9e"}, + {file = "rapidfuzz-2.13.7-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:effe182767d102cb65dfbbf74192237dbd22d4191928d59415aa7d7c861d8c88"}, + {file = "rapidfuzz-2.13.7-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:25b4cedf2aa19fb7212894ce5f5219010cce611b60350e9a0a4d492122e7b351"}, + {file = "rapidfuzz-2.13.7-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:3a9bd02e1679c0fd2ecf69b72d0652dbe2a9844eaf04a36ddf4adfbd70010e95"}, + {file = "rapidfuzz-2.13.7-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:5e2b3d020219baa75f82a4e24b7c8adcb598c62f0e54e763c39361a9e5bad510"}, + {file = "rapidfuzz-2.13.7-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:cf62dacb3f9234f3fddd74e178e6d25c68f2067fde765f1d95f87b1381248f58"}, + {file = "rapidfuzz-2.13.7-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:fa263135b892686e11d5b84f6a1892523123a00b7e5882eff4fbdabb38667347"}, + {file = "rapidfuzz-2.13.7-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:fa4c598ed77f74ec973247ca776341200b0f93ec3883e34c222907ce72cb92a4"}, + {file = "rapidfuzz-2.13.7-cp37-cp37m-win32.whl", hash = "sha256:c2523f8180ebd9796c18d809e9a19075a1060b1a170fde3799e83db940c1b6d5"}, + {file = 
"rapidfuzz-2.13.7-cp37-cp37m-win_amd64.whl", hash = "sha256:5ada0a14c67452358c1ee52ad14b80517a87b944897aaec3e875279371a9cb96"}, + {file = "rapidfuzz-2.13.7-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:ca8a23097c1f50e0fdb4de9e427537ca122a18df2eead06ed39c3a0bef6d9d3a"}, + {file = "rapidfuzz-2.13.7-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9be02162af0376d64b840f2fc8ee3366794fc149f1e06d095a6a1d42447d97c5"}, + {file = "rapidfuzz-2.13.7-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:af4f7c3c904ca709493eb66ca9080b44190c38e9ecb3b48b96d38825d5672559"}, + {file = "rapidfuzz-2.13.7-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f50d1227e6e2a0e3ae1fb1c9a2e1c59577d3051af72c7cab2bcc430cb5e18da"}, + {file = "rapidfuzz-2.13.7-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c71d9d512b76f05fa00282227c2ae884abb60e09f08b5ca3132b7e7431ac7f0d"}, + {file = "rapidfuzz-2.13.7-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b52ac2626945cd21a2487aeefed794c14ee31514c8ae69b7599170418211e6f6"}, + {file = "rapidfuzz-2.13.7-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ca00fafd2756bc9649bf80f1cf72c647dce38635f0695d7ce804bc0f759aa756"}, + {file = "rapidfuzz-2.13.7-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d248a109699ce9992304e79c1f8735c82cc4c1386cd8e27027329c0549f248a2"}, + {file = "rapidfuzz-2.13.7-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:c88adbcb933f6b8612f6c593384bf824e562bb35fc8a0f55fac690ab5b3486e5"}, + {file = "rapidfuzz-2.13.7-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:c8601a66fbfc0052bb7860d2eacd303fcde3c14e87fdde409eceff516d659e77"}, + {file = "rapidfuzz-2.13.7-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:27be9c63215d302ede7d654142a2e21f0d34ea6acba512a4ae4cfd52bbaa5b59"}, + {file = "rapidfuzz-2.13.7-cp38-cp38-musllinux_1_1_s390x.whl", hash = 
"sha256:3dcffe1f3cbda0dc32133a2ae2255526561ca594f15f9644384549037b355245"}, + {file = "rapidfuzz-2.13.7-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:8450d15f7765482e86ef9be2ad1a05683cd826f59ad236ef7b9fb606464a56aa"}, + {file = "rapidfuzz-2.13.7-cp38-cp38-win32.whl", hash = "sha256:460853983ab88f873173e27cc601c5276d469388e6ad6e08c4fd57b2a86f1064"}, + {file = "rapidfuzz-2.13.7-cp38-cp38-win_amd64.whl", hash = "sha256:424f82c35dbe4f83bdc3b490d7d696a1dc6423b3d911460f5493b7ffae999fd2"}, + {file = "rapidfuzz-2.13.7-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:c3fbe449d869ea4d0909fc9d862007fb39a584fb0b73349a6aab336f0d90eaed"}, + {file = "rapidfuzz-2.13.7-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:16080c05a63d6042643ae9b6cfec1aefd3e61cef53d0abe0df3069b9d4b72077"}, + {file = "rapidfuzz-2.13.7-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:dbcf5371ea704759fcce772c66a07647751d1f5dbdec7818331c9b31ae996c77"}, + {file = "rapidfuzz-2.13.7-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:114810491efb25464016fd554fdf1e20d390309cecef62587494fc474d4b926f"}, + {file = "rapidfuzz-2.13.7-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:99a84ab9ac9a823e7e93b4414f86344052a5f3e23b23aa365cda01393ad895bd"}, + {file = "rapidfuzz-2.13.7-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:81642a24798851b118f82884205fc1bd9ff70b655c04018c467824b6ecc1fabc"}, + {file = "rapidfuzz-2.13.7-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c3741cb0bf9794783028e8b0cf23dab917fa5e37a6093b94c4c2f805f8e36b9f"}, + {file = "rapidfuzz-2.13.7-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:759a3361711586a29bc753d3d1bdb862983bd9b9f37fbd7f6216c24f7c972554"}, + {file = "rapidfuzz-2.13.7-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:1333fb3d603d6b1040e365dca4892ba72c7e896df77a54eae27dc07db90906e3"}, + {file = "rapidfuzz-2.13.7-cp39-cp39-musllinux_1_1_i686.whl", hash 
= "sha256:916bc2e6cf492c77ad6deb7bcd088f0ce9c607aaeabc543edeb703e1fbc43e31"}, + {file = "rapidfuzz-2.13.7-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:23524635840500ce6f4d25005c9529a97621689c85d2f727c52eed1782839a6a"}, + {file = "rapidfuzz-2.13.7-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:ebe303cd9839af69dd1f7942acaa80b1ba90bacef2e7ded9347fbed4f1654672"}, + {file = "rapidfuzz-2.13.7-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:fe56659ccadbee97908132135de4b875543353351e0c92e736b7c57aee298b5a"}, + {file = "rapidfuzz-2.13.7-cp39-cp39-win32.whl", hash = "sha256:3f11a7eff7bc6301cd6a5d43f309e22a815af07e1f08eeb2182892fca04c86cb"}, + {file = "rapidfuzz-2.13.7-cp39-cp39-win_amd64.whl", hash = "sha256:e8914dad106dacb0775718e54bf15e528055c4e92fb2677842996f2d52da5069"}, + {file = "rapidfuzz-2.13.7-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:7f7930adf84301797c3f09c94b9c5a9ed90a9e8b8ed19b41d2384937e0f9f5bd"}, + {file = "rapidfuzz-2.13.7-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c31022d9970177f6affc6d5dd757ed22e44a10890212032fabab903fdee3bfe7"}, + {file = "rapidfuzz-2.13.7-pp37-pypy37_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f42b82f268689f429def9ecfb86fa65ceea0eaf3fed408b570fe113311bf5ce7"}, + {file = "rapidfuzz-2.13.7-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8b477b43ced896301665183a5e0faec0f5aea2373005648da8bdcb3c4b73f280"}, + {file = "rapidfuzz-2.13.7-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:d63def9bbc6b35aef4d76dc740301a4185867e8870cbb8719ec9de672212fca8"}, + {file = "rapidfuzz-2.13.7-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:c66546e30addb04a16cd864f10f5821272a1bfe6462ee5605613b4f1cb6f7b48"}, + {file = "rapidfuzz-2.13.7-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f799d1d6c33d81e983d3682571cc7d993ae7ff772c19b3aabb767039c33f6d1e"}, + {file = 
"rapidfuzz-2.13.7-pp38-pypy38_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d82f20c0060ffdaadaf642b88ab0aa52365b56dffae812e188e5bdb998043588"}, + {file = "rapidfuzz-2.13.7-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:042644133244bfa7b20de635d500eb9f46af7097f3d90b1724f94866f17cb55e"}, + {file = "rapidfuzz-2.13.7-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:75c45dcd595f8178412367e302fd022860ea025dc4a78b197b35428081ed33d5"}, + {file = "rapidfuzz-2.13.7-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:3d8b081988d0a49c486e4e845a547565fee7c6e7ad8be57ff29c3d7c14c6894c"}, + {file = "rapidfuzz-2.13.7-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:16ffad751f43ab61001187b3fb4a9447ec2d1aedeff7c5bac86d3b95f9980cc3"}, + {file = "rapidfuzz-2.13.7-pp39-pypy39_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:020858dd89b60ce38811cd6e37875c4c3c8d7fcd8bc20a0ad2ed1f464b34dc4e"}, + {file = "rapidfuzz-2.13.7-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cda1e2f66bb4ba7261a0f4c2d052d5d909798fca557cbff68f8a79a87d66a18f"}, + {file = "rapidfuzz-2.13.7-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:b6389c50d8d214c9cd11a77f6d501529cb23279a9c9cafe519a3a4b503b5f72a"}, + {file = "rapidfuzz-2.13.7.tar.gz", hash = "sha256:8d3e252d4127c79b4d7c2ae47271636cbaca905c8bb46d80c7930ab906cf4b5c"}, +] + +[package.extras] +full = ["numpy"] + [[package]] name = "requests" version = "2.31.0" @@ -2337,13 +2567,13 @@ testing = ["black (==22.3)", "datasets", "numpy", "pytest", "requests"] [[package]] name = "tqdm" -version = "4.65.1" +version = "4.66.0" description = "Fast, Extensible Progress Meter" optional = false python-versions = ">=3.7" files = [ - {file = "tqdm-4.65.1-py3-none-any.whl", hash = "sha256:16181c62ad2c6f8f6f29876e66322faad1c7fd3cc70aa9cc25ff63e50d1da031"}, - {file = "tqdm-4.65.1.tar.gz", hash = 
"sha256:2cb0075cc5269f8edac40bdeb757cc36ab5b6648caf014822b67e1a49fba141d"}, + {file = "tqdm-4.66.0-py3-none-any.whl", hash = "sha256:39d459c7140b7890174e69d4d68d6291bc774a55b4bc5d93c0b760798ac5a03e"}, + {file = "tqdm-4.66.0.tar.gz", hash = "sha256:cc6e7e52202d894e66632c5c8a9330bd0e3ff35d2965c93ca832114a3d865362"}, ] [package.dependencies] @@ -2766,4 +2996,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.11" -content-hash = "75afc46634677cd9afdf2ae66b320a8eaaa36d360d0ba187e5974b90810df44f" +content-hash = "d6884c33e9aeded69df66c1fd60cd55edc121ed20d10037967a4ea1dcb62b055" diff --git a/server/pyproject.toml b/server/pyproject.toml index 039e1f5a..e27ad8a5 100644 --- a/server/pyproject.toml +++ b/server/pyproject.toml @@ -41,6 +41,9 @@ pytest-asyncio = "^0.21.1" pytest = "^7.4.0" httpx-ws = "^0.4.1" pytest-httpx = "^0.23.1" +jiwer = "^3.0.2" +levenshtein = "^0.21.1" +tqdm = "^4.66.0" [tool.poetry.group.aws.dependencies] From 0a11a7f6698303f03816e89e83490474b2a069d4 Mon Sep 17 00:00:00 2001 From: Gokul Mohanarangan Date: Thu, 10 Aug 2023 12:16:39 +0530 Subject: [PATCH 3/6] change dependency group --- server/poetry.lock | 2 +- server/pyproject.toml | 9 ++++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/server/poetry.lock b/server/poetry.lock index b99ae6a0..4c309f6c 100644 --- a/server/poetry.lock +++ b/server/poetry.lock @@ -2996,4 +2996,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.11" -content-hash = "d6884c33e9aeded69df66c1fd60cd55edc121ed20d10037967a4ea1dcb62b055" +content-hash = "a5cd48fcfc629c2cd2f4fcc4263c57f867d84acf60824eaf952e365578374d1d" diff --git a/server/pyproject.toml b/server/pyproject.toml index e27ad8a5..3da319bb 100644 --- a/server/pyproject.toml +++ b/server/pyproject.toml @@ -41,14 +41,17 @@ pytest-asyncio = "^0.21.1" pytest = "^7.4.0" httpx-ws = "^0.4.1" pytest-httpx = "^0.23.1" -jiwer = "^3.0.2" -levenshtein = "^0.21.1" -tqdm = "^4.66.0" 
[tool.poetry.group.aws.dependencies] aioboto3 = "^11.2.0" + +[tool.poetry.group.evaluation.dependencies] +jiwer = "^3.0.2" +levenshtein = "^0.21.1" +tqdm = "^4.66.0" + [build-system] requires = ["poetry-core"] build-backend = "poetry.core.masonry.api" From bb983194f8539413c4ffc2e300b114727d578ccf Mon Sep 17 00:00:00 2001 From: Gokul Mohanarangan Date: Thu, 10 Aug 2023 12:24:07 +0530 Subject: [PATCH 4/6] update folder structure --- server/{tests => }/evaluate/__init__.py | 0 server/{tests => }/evaluate/evaluate_transcription.py | 0 server/{tests => }/evaluate/predicted_texts/pred_sample_1.txt | 0 server/{tests => }/evaluate/predicted_texts/pred_sample_2.txt | 0 server/{tests => }/evaluate/predicted_texts/pred_sample_3.txt | 0 server/{tests => }/evaluate/reference_texts/ref_sample_1.txt | 0 server/{tests => }/evaluate/reference_texts/ref_sample_2.txt | 0 server/{tests => }/evaluate/reference_texts/ref_sample_3.txt | 0 8 files changed, 0 insertions(+), 0 deletions(-) rename server/{tests => }/evaluate/__init__.py (100%) rename server/{tests => }/evaluate/evaluate_transcription.py (100%) rename server/{tests => }/evaluate/predicted_texts/pred_sample_1.txt (100%) rename server/{tests => }/evaluate/predicted_texts/pred_sample_2.txt (100%) rename server/{tests => }/evaluate/predicted_texts/pred_sample_3.txt (100%) rename server/{tests => }/evaluate/reference_texts/ref_sample_1.txt (100%) rename server/{tests => }/evaluate/reference_texts/ref_sample_2.txt (100%) rename server/{tests => }/evaluate/reference_texts/ref_sample_3.txt (100%) diff --git a/server/tests/evaluate/__init__.py b/server/evaluate/__init__.py similarity index 100% rename from server/tests/evaluate/__init__.py rename to server/evaluate/__init__.py diff --git a/server/tests/evaluate/evaluate_transcription.py b/server/evaluate/evaluate_transcription.py similarity index 100% rename from server/tests/evaluate/evaluate_transcription.py rename to server/evaluate/evaluate_transcription.py diff --git 
a/server/tests/evaluate/predicted_texts/pred_sample_1.txt b/server/evaluate/predicted_texts/pred_sample_1.txt similarity index 100% rename from server/tests/evaluate/predicted_texts/pred_sample_1.txt rename to server/evaluate/predicted_texts/pred_sample_1.txt diff --git a/server/tests/evaluate/predicted_texts/pred_sample_2.txt b/server/evaluate/predicted_texts/pred_sample_2.txt similarity index 100% rename from server/tests/evaluate/predicted_texts/pred_sample_2.txt rename to server/evaluate/predicted_texts/pred_sample_2.txt diff --git a/server/tests/evaluate/predicted_texts/pred_sample_3.txt b/server/evaluate/predicted_texts/pred_sample_3.txt similarity index 100% rename from server/tests/evaluate/predicted_texts/pred_sample_3.txt rename to server/evaluate/predicted_texts/pred_sample_3.txt diff --git a/server/tests/evaluate/reference_texts/ref_sample_1.txt b/server/evaluate/reference_texts/ref_sample_1.txt similarity index 100% rename from server/tests/evaluate/reference_texts/ref_sample_1.txt rename to server/evaluate/reference_texts/ref_sample_1.txt diff --git a/server/tests/evaluate/reference_texts/ref_sample_2.txt b/server/evaluate/reference_texts/ref_sample_2.txt similarity index 100% rename from server/tests/evaluate/reference_texts/ref_sample_2.txt rename to server/evaluate/reference_texts/ref_sample_2.txt diff --git a/server/tests/evaluate/reference_texts/ref_sample_3.txt b/server/evaluate/reference_texts/ref_sample_3.txt similarity index 100% rename from server/tests/evaluate/reference_texts/ref_sample_3.txt rename to server/evaluate/reference_texts/ref_sample_3.txt From af954e2818b921685f56fb1ba69c513e16ecc7d1 Mon Sep 17 00:00:00 2001 From: Gokul Mohanarangan Date: Thu, 10 Aug 2023 14:33:46 +0530 Subject: [PATCH 5/6] resolve review comments --- server/evaluate/evaluate_transcription.py | 133 ++++++++++------------ server/poetry.lock | 2 +- server/pyproject.toml | 1 + 3 files changed, 60 insertions(+), 76 deletions(-) diff --git 
a/server/evaluate/evaluate_transcription.py b/server/evaluate/evaluate_transcription.py index 26c1ce7a..c402f34e 100644 --- a/server/evaluate/evaluate_transcription.py +++ b/server/evaluate/evaluate_transcription.py @@ -1,89 +1,67 @@ -import json -import os import re -from dataclasses import dataclass from pathlib import Path -from typing import List, Union +from typing import Any, List from jiwer import wer from Levenshtein import distance +from pydantic import BaseModel, Field, field_validator from tqdm.auto import tqdm from whisper.normalizers import EnglishTextNormalizer -@dataclass -class EvaluationResult: +class EvaluationResult(BaseModel): """ Result object of the model evaluation """ - - accuracy = float - total_test_samples = int - - def __init__(self, accuracy, total_test_samples): - self.accuracy = accuracy - self.total_test_samples = total_test_samples - - def __repr__(self): - return ( - "EvaluationResult(" - + json.dumps( - { - "accuracy": self.accuracy, - "total_test_samples": self.total_test_samples, - } - ) - + ")" - ) + accuracy: float = Field(default=0.0) + total_test_samples: int = Field(default=0) -@dataclass -class EvaluationTestSample: +class EvaluationTestSample(BaseModel): """ Represents one test sample """ - reference_text = str - predicted_text = str + reference_text: str + predicted_text: str - def __init__(self, reference_text, predicted_text): - self.reference_text = reference_text - self.predicted_text = predicted_text - - def update(self, reference_text, predicted_text): + def update(self, reference_text:str, predicted_text:str) -> None: self.reference_text = reference_text self.predicted_text = predicted_text -class TestDatasetLoader: +class TestDatasetLoader(BaseModel): """ Test samples loader """ - parent_dir = None - total_samples = 0 + test_dir: Path = Field(default=Path(__file__).parent) + total_samples: int = Field(default=0) - def __init__(self, parent_dir: Union[Path | str]): - if isinstance(parent_dir, str): - 
self.parent_dir = Path(parent_dir) - else: - self.parent_dir = parent_dir + @field_validator("test_dir") + def validate_file_path(cls, path): + """ + Check the file path + """ + if not path.exists(): + raise ValueError("Path does not exist") + return path - def _load_test_data(self) -> tuple[str, str]: + def _load_test_data(self) -> tuple[Path, Path]: """ Loader function to validate inout files and generate samples """ - PREDICTED_TEST_SAMPLES_DIR = self.parent_dir / "predicted_texts" - REFERENCE_TEST_SAMPLES_DIR = self.parent_dir / "reference_texts" + PREDICTED_TEST_SAMPLES_DIR = self.test_dir / "predicted_texts" + REFERENCE_TEST_SAMPLES_DIR = self.test_dir / "reference_texts" - for filename in os.listdir(PREDICTED_TEST_SAMPLES_DIR.as_posix()): - match = re.search(r"(\d+)\.txt$", filename) + for filename in PREDICTED_TEST_SAMPLES_DIR.iterdir(): + match = re.search(r"(\d+)\.txt$", filename.as_posix()) if match: sample_id = match.group(1) - pred_file_path = (PREDICTED_TEST_SAMPLES_DIR / filename).as_posix() + pred_file_path = PREDICTED_TEST_SAMPLES_DIR / filename ref_file_name = "ref_sample_" + str(sample_id) + ".txt" - ref_file_path = (REFERENCE_TEST_SAMPLES_DIR / ref_file_name).as_posix() - if os.path.exists(ref_file_path): + ref_file_path = REFERENCE_TEST_SAMPLES_DIR / ref_file_name + if ref_file_path.exists(): self.total_samples += 1 yield ref_file_path, pred_file_path @@ -96,7 +74,18 @@ class TestDatasetLoader: pred_text = file.read() with open(ref_file_path, "r", encoding="utf-8") as file: ref_text = file.read() - yield EvaluationTestSample(ref_text, pred_text) + yield EvaluationTestSample(reference_text=ref_text, predicted_text=pred_text) + + +class EvaluationConfig(BaseModel): + """ + Model for evaluation parameters + """ + insertion_penalty: int = Field(default=1) + substitution_penalty: int = Field(default=1) + deletion_penalty: int = Field(default=1) + normalizer: Any = Field(default=EnglishTextNormalizer()) + test_directory: str = 
Field(default=str(Path(__file__).parent)) class ModelEvaluator: @@ -111,38 +100,29 @@ class ModelEvaluator: WEIGHTED_WER_JIWER = 0.0 WER_JIWER = [] - normalizer = None - accuracy = None + evaluation_result = EvaluationResult() test_dataset_loader = None - test_directory = None - evaluation_config = {} + evaluation_config = None def __init__(self, **kwargs): - self.evaluation_config = {k: v for k, v in kwargs.items() if v is not None} - if "normalizer" not in self.evaluation_config: - self.normalizer = EnglishTextNormalizer() - self.evaluation_config["normalizer"] = str(type(self.normalizer)) - if "parent_dir" not in self.evaluation_config: - self.test_directory = Path(__file__).parent - self.test_dataset_loader = TestDatasetLoader(self.test_directory) - self.evaluation_config["test_directory"] = str(self.test_directory) + self.evaluation_config = EvaluationConfig(**kwargs) + self.test_dataset_loader = TestDatasetLoader(test_dir=self.evaluation_config.test_directory) def __repr__(self): - return "ModelEvaluator(" + json.dumps(self.describe(), indent=4) + ")" + return f"ModelEvaluator({self.evaluation_config})" def describe(self) -> dict: """ Returns the parameters defining the evaluator """ - return self.evaluation_config - + return self.evaluation_config.model_dump() def _normalize(self, sample: EvaluationTestSample) -> None: """ Normalize both reference and predicted text """ sample.update( - self.normalizer(sample.reference_text), - self.normalizer(sample.predicted_text), + self.evaluation_config.normalizer(sample.reference_text), + self.evaluation_config.normalizer(sample.predicted_text), ) def _calculate_wer(self, sample: EvaluationTestSample) -> float: @@ -154,9 +134,9 @@ class ModelEvaluator: s1=sample.reference_text, s2=sample.predicted_text, weights=( - self.evaluation_config["insertion_penalty"], - self.evaluation_config["deletion_penalty"], - self.evaluation_config["substitution_penalty"], + self.evaluation_config.insertion_penalty, + 
self.evaluation_config.deletion_penalty, + self.evaluation_config.substitution_penalty, ), ) wer = levenshtein_distance / len(sample.reference_text) @@ -166,7 +146,7 @@ class ModelEvaluator: """ Compute WER """ - for sample in tqdm(self.test_dataset_loader, desc="Evaluating", ncols=100): + for sample in tqdm(self.test_dataset_loader, desc="Evaluating"): self._normalize(sample) wer_item_l = { "wer": self._calculate_wer(sample), @@ -199,15 +179,18 @@ class ModelEvaluator: weighted_wer_jiwer = self._calculate_weighted_wer(self.WER_JIWER) final_weighted_wer = (weighted_wer_levenshtein + weighted_wer_jiwer) / 2 - self.accuracy = (1 - final_weighted_wer) * 100 + self.evaluation_result.accuracy = (1 - final_weighted_wer) * 100 def evaluate(self, recalculate: bool = False) -> EvaluationResult: """ Triggers the model evaluation """ - if not self.accuracy or recalculate: + if not self.evaluation_result.accuracy or recalculate: self._calculate_model_accuracy() - return EvaluationResult(self.accuracy, self.test_dataset_loader.total_samples) + return EvaluationResult( + accuracy=self.evaluation_result.accuracy, + total_test_samples=self.test_dataset_loader.total_samples + ) eval_config = {"insertion_penalty": 1, "deletion_penalty": 2, "substitution_penalty": 1} diff --git a/server/poetry.lock b/server/poetry.lock index 4c309f6c..dc5cae28 100644 --- a/server/poetry.lock +++ b/server/poetry.lock @@ -2996,4 +2996,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.11" -content-hash = "a5cd48fcfc629c2cd2f4fcc4263c57f867d84acf60824eaf952e365578374d1d" +content-hash = "c9924049dacf7310590416f096f5b20f6ed905d8a50edf5e8afcf2c28b70799f" diff --git a/server/pyproject.toml b/server/pyproject.toml index 3da319bb..cdd510a0 100644 --- a/server/pyproject.toml +++ b/server/pyproject.toml @@ -51,6 +51,7 @@ aioboto3 = "^11.2.0" jiwer = "^3.0.2" levenshtein = "^0.21.1" tqdm = "^4.66.0" +pydantic = "^2.1.1" [build-system] requires = ["poetry-core"] From 
992134a38bcae69dfee75eb17f4d01129311a212 Mon Sep 17 00:00:00 2001 From: Gokul Mohanarangan Date: Thu, 10 Aug 2023 14:37:39 +0530 Subject: [PATCH 6/6] minor update --- server/evaluate/evaluate_transcription.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/server/evaluate/evaluate_transcription.py b/server/evaluate/evaluate_transcription.py index c402f34e..a55c8ee4 100644 --- a/server/evaluate/evaluate_transcription.py +++ b/server/evaluate/evaluate_transcription.py @@ -49,7 +49,7 @@ class TestDatasetLoader(BaseModel): def _load_test_data(self) -> tuple[Path, Path]: """ - Loader function to validate inout files and generate samples + Loader function to validate input files and generate samples """ PREDICTED_TEST_SAMPLES_DIR = self.test_dir / "predicted_texts" REFERENCE_TEST_SAMPLES_DIR = self.test_dir / "reference_texts" @@ -116,6 +116,7 @@ class ModelEvaluator: Returns the parameters defining the evaluator """ return self.evaluation_config.model_dump() + def _normalize(self, sample: EvaluationTestSample) -> None: """ Normalize both reference and predicted text