Fix extra space between some tokens (punctuation) (#267)

* ensure uptime for reflector.media

* remove extra space before punctuation

* update detokenizer method

* create detokenizer property

* merge conflict
This commit is contained in:
projects-g
2023-10-12 10:42:19 +05:30
committed by GitHub
parent 47f7e1836e
commit bbe63ad407

View File

@@ -109,6 +109,7 @@ class LLM:
self.m_generate_call = self.m_generate_call.labels(name)
self.m_generate_success = self.m_generate_success.labels(name)
self.m_generate_failure = self.m_generate_failure.labels(name)
self.detokenizer = nltk.tokenize.treebank.TreebankWordDetokenizer()
@property
def tokenizer(self):
@@ -193,15 +194,11 @@ class LLM:
camel_cased.append(word[0].upper() + word[1:])
else:
camel_cased.append(word)
modified_title = " ".join(camel_cased)
modified_title = self.detokenizer.detokenize(camel_cased)
# The result can have words in braces with additional space.
# Change ( ABC ), [ ABC ], etc. ==> (ABC), [ABC], etc.
pattern = r"(?<=[\[\{\(])\s+|\s+(?=[\]\}\)])"
title = re.sub(pattern, "", modified_title)
# Irrespective of casing changes, the starting letter
# of title is always upper-cased
title = title[0].upper() + title[1:]
title = modified_title[0].upper() + modified_title[1:]
except Exception as e:
reflector_logger.info(
f"Failed to ensure casing on {title=} " f"with exception : {str(e)}"