Merge pull request #204 from emcifuntik:master

Speech to text transcription
This commit is contained in:
Eduard Kuzmenko 2023-02-19 23:15:34 +04:00 committed by GitHub
commit 7ff73c2d7f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 200 additions and 8 deletions

View File

@ -163,6 +163,13 @@ async function wrapVoiceMessage(audioEl: AudioElement) {
const {svg, container: svgContainer, availW} = createWaveformBars(waveform, doc.duration);
const audioControlsDiv = document.createElement('div');
audioControlsDiv.classList.add('audio-controls');
const audioTimelineDiv = document.createElement('div');
audioTimelineDiv.classList.add('audio-timeline');
audioControlsDiv.append(audioTimelineDiv);
const fakeSvgContainer = svgContainer.cloneNode(true) as HTMLElement;
fakeSvgContainer.classList.add('audio-waveform-fake');
svgContainer.classList.add('audio-waveform-background');
@ -173,7 +180,67 @@ async function wrapVoiceMessage(audioEl: AudioElement) {
const timeDiv = document.createElement('div');
timeDiv.classList.add('audio-time');
audioEl.append(waveformContainer, timeDiv);
audioTimelineDiv.append(waveformContainer, timeDiv);
audioEl.append(audioControlsDiv);
const isPremium: boolean = rootScope.premium;
if(isPremium) {
  // Premium-only: "voice → text" transcription button + result container.
  const speechRecognitionDiv = document.createElement('div');
  speechRecognitionDiv.classList.add('audio-to-text-button');
  const speechRecognitionIcon = document.createElement('span');
  speechRecognitionIcon.innerHTML = '→A';

  // Animated rounded-rect stroke shown while the transcription request is in flight.
  const speechRecognitionLoader = document.createElement('div');
  speechRecognitionLoader.classList.add('loader');
  speechRecognitionLoader.innerHTML = '<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 32 24"><style></style><rect fill="transparent" stroke-width="3" stroke-linejoin="round" rx="6" ry="6" stroke="var(--message-out-primary-color)" stroke-dashoffset="1" stroke-dasharray="32,68" width="32" height="24"></rect></svg>';
  speechRecognitionDiv.append(speechRecognitionIcon, speechRecognitionLoader);

  // Container for the transcribed text; hidden until a result is available.
  const speechTextDiv = document.createElement('div');
  speechTextDiv.classList.add('audio-to-text');
  speechTextDiv.style.display = 'none';

  // transcriptionState: 0 = idle/hidden, 1 = request in flight, 2 = transcription shown.
  // TODO: promote these magic numbers to a shared enum.
  speechRecognitionDiv.onclick = async() => {
    if(audioEl.transcriptionState === 0) {
      if(speechTextDiv.textContent !== '') {
        // A previous result is already cached in the DOM — just reveal it again.
        speechTextDiv.style.display = 'block';
        speechRecognitionIcon.innerHTML = '^';
        audioEl.transcriptionState = 2;
      } else {
        audioEl.transcriptionState = 1;
        speechRecognitionLoader.classList.add('active');
        let transcription;
        try {
          transcription = await audioEl.managers.appMessagesManager.transcribeAudio(message);
        } catch(err) {
          // Request failed — reset state so the user can retry.
          speechRecognitionLoader.classList.remove('active');
          audioEl.transcriptionState = 0;
          return;
        }
        if(transcription.pFlags.pending === true) {
          // Final text will arrive later via the `message_transcribed` broadcast event.
          return;
        }
        // textContent (not innerHTML) so server-provided text cannot inject markup.
        speechTextDiv.textContent = transcription.text;
        speechTextDiv.style.display = 'block';
        speechRecognitionIcon.innerHTML = '^';
        speechRecognitionLoader.classList.remove('active');
        audioEl.transcriptionState = 2;
      }
    } else if(audioEl.transcriptionState === 2) {
      // Hide the transcription and restore the initial button icon.
      speechRecognitionIcon.innerHTML = '→A';
      speechTextDiv.style.display = 'none';
      audioEl.transcriptionState = 0;
    }
  };

  audioControlsDiv.append(speechRecognitionDiv);
  audioEl.append(speechTextDiv);
}
let progress = svg as any as HTMLElement;
@ -426,6 +493,7 @@ export default class AudioElement extends HTMLElement {
public lazyLoadQueue: LazyLoadQueue;
public loadPromises: Promise<any>[];
public managers: AppManagers;
public transcriptionState: number = 0;
private listenerSetter = new ListenerSetter();
private onTypeDisconnect: () => void;

View File

@ -620,6 +620,29 @@ export default class ChatBubbles {
bubble.classList.add('is-error');
});
this.listenerSetter.add(rootScope)('message_transcribed', async({peerId, mid, text}) => {
  if(peerId !== this.peerId) return;
  const bubble = this.bubbles[mid];
  if(!bubble) return;
  // TODO: move this DOM surgery into an AudioElement method (`finishVoiceTranscription`)
  const audioElement = bubble.querySelector('audio-element') as AudioElement;
  if(audioElement) {
    const speechTextDiv = audioElement.querySelector('.audio-to-text') as HTMLElement;
    const speechRecognitionIcon = audioElement.querySelector('.audio-to-text-button span');
    const speechRecognitionLoader = audioElement.querySelector('.loader');
    if(speechTextDiv && speechRecognitionIcon) {
      // textContent (not innerHTML): the transcription is plain text from the server.
      speechTextDiv.textContent = text;
      speechTextDiv.style.display = 'block';
      speechRecognitionIcon.innerHTML = '^';
      // Loader may be absent if the button markup changes — guard before touching it.
      speechRecognitionLoader?.classList.remove('active');
      audioElement.transcriptionState = 2;
    }
  }
});
this.listenerSetter.add(rootScope)('album_edit', ({peerId, messages, deletedMids}) => {
if(peerId !== this.peerId) return;

View File

@ -17,7 +17,7 @@ import LazyLoadQueueBase from '../../components/lazyLoadQueueBase';
import deferredPromise, {CancellablePromise} from '../../helpers/cancellablePromise';
import tsNow from '../../helpers/tsNow';
import {randomLong} from '../../helpers/random';
import {Chat, ChatFull, Dialog as MTDialog, DialogPeer, DocumentAttribute, InputMedia, InputMessage, InputPeerNotifySettings, InputSingleMedia, Message, MessageAction, MessageEntity, MessageFwdHeader, MessageMedia, MessageReplies, MessageReplyHeader, MessagesDialogs, MessagesFilter, MessagesMessages, MethodDeclMap, NotifyPeer, PeerNotifySettings, PhotoSize, SendMessageAction, Update, Photo, Updates, ReplyMarkup, InputPeer, InputPhoto, InputDocument, InputGeoPoint, WebPage, GeoPoint, ReportReason, MessagesGetDialogs, InputChannel, InputDialogPeer, ReactionCount, MessagePeerReaction, MessagesSearchCounter, Peer, MessageReactions, Document, InputFile, Reaction, ForumTopic as MTForumTopic, MessagesForumTopics, MessagesGetReplies, MessagesGetHistory, MessagesAffectedHistory, UrlAuthResult} from '../../layer';
import {Chat, ChatFull, Dialog as MTDialog, DialogPeer, DocumentAttribute, InputMedia, InputMessage, InputPeerNotifySettings, InputSingleMedia, Message, MessageAction, MessageEntity, MessageFwdHeader, MessageMedia, MessageReplies, MessageReplyHeader, MessagesDialogs, MessagesFilter, MessagesMessages, MethodDeclMap, NotifyPeer, PeerNotifySettings, PhotoSize, SendMessageAction, Update, Photo, Updates, ReplyMarkup, InputPeer, InputPhoto, InputDocument, InputGeoPoint, WebPage, GeoPoint, ReportReason, MessagesGetDialogs, InputChannel, InputDialogPeer, ReactionCount, MessagePeerReaction, MessagesSearchCounter, Peer, MessageReactions, Document, InputFile, Reaction, ForumTopic as MTForumTopic, MessagesForumTopics, MessagesGetReplies, MessagesGetHistory, MessagesAffectedHistory, UrlAuthResult, MessagesTranscribedAudio} from '../../layer';
import {ArgumentTypes, InvokeApiOptions} from '../../types';
import {logger, LogTypes} from '../logger';
import {ReferenceContext} from '../mtproto/referenceDatabase';
@ -313,7 +313,9 @@ export class AppMessagesManager extends AppManager {
updateDeleteScheduledMessages: this.onUpdateDeleteScheduledMessages,
updateMessageExtendedMedia: this.onUpdateMessageExtendedMedia
updateMessageExtendedMedia: this.onUpdateMessageExtendedMedia,
updateTranscribedAudio: this.onUpdateTranscribedAudio
});
// ! Invalidate notify settings, can optimize though
@ -522,6 +524,27 @@ export class AppMessagesManager extends AppManager {
});
}
public async transcribeAudio(message: any): Promise<MessagesTranscribedAudio> {
console.log('Method called');
const {id, peerId} = message;
let promise: Promise<MessagesTranscribedAudio>, params: any;
if(peerId) {
promise = this.apiManager.invokeApiSingleProcess({
method: 'messages.transcribeAudio',
params: params = {
peer: this.appPeersManager.getInputPeerById(peerId),
msg_id: id
},
processResult: (result) => {
console.log(result);
return result;
}
});
}
return promise;
}
public async sendText(peerId: PeerId, text: string, options: MessageSendingParams & Partial<{
entities: MessageEntity[],
viaBotId: BotId,
@ -5248,6 +5271,16 @@ export class AppMessagesManager extends AppManager {
});
};
// Handles `updateTranscribedAudio`: once a transcription is no longer pending,
// broadcast the final text so the chat UI can render it into the bubble.
private onUpdateTranscribedAudio = (update: Update.updateTranscribedAudio) => {
  if(update.pFlags.pending === true) {
    return; // final text not ready yet; a follow-up update will carry it
  }

  this.rootScope.dispatchEvent('message_transcribed', {
    peerId: this.appPeersManager.getPeerId(update.peer),
    mid: generateMessageId(update.msg_id),
    text: update.text
  });
};
public setDialogToStateIfMessageIsTop(message: MyMessage) {
if(this.isMessageIsTopMessage(message)) {
this.dialogsStorage.setDialogToState(this.getDialogOnly(message.peerId));

View File

@ -73,6 +73,7 @@ export type BroadcastEvents = {
'message_edit': {storageKey: MessagesStorageKey, peerId: PeerId, mid: number, message: MyMessage},
'message_sent': {storageKey: MessagesStorageKey, tempId: number, tempMessage: any, mid: number, message: MyMessage},
'message_error': {storageKey: MessagesStorageKey, tempId: number, error: ApiError},
'message_transcribed': {peerId: PeerId, mid: number, text: string},
'messages_views': {peerId: PeerId, mid: number, views: number}[],
'messages_reactions': {message: Message.message, changedResults: ReactionCount[]}[],
'messages_pending': void,

View File

@ -428,7 +428,7 @@
// &.audio-48 {
--icon-size: 3rem;
--icon-margin: .5625rem;
height: var(--icon-size);
min-height: var(--icon-size);
.audio-details {
margin-top: 3px;
@ -439,6 +439,66 @@
margin-bottom: -2px;
}
.audio-controls {
  display: flex;
  align-items: flex-start;
  gap: .25em;

  // "→A" transcription toggle shown next to the waveform (premium only).
  .audio-to-text-button {
    background: var(--message-transcribe-button);
    margin-top: 0.2rem;
    font-size: .8em;
    border-radius: 0.3rem;
    // Fixed size; a duplicate `width: max-content` declaration was dead
    // (immediately overridden) and has been removed.
    width: 2rem;
    height: 1.5rem;
    display: flex;
    align-items: center;
    justify-content: center;
    position: relative;
    overflow: hidden;

    span {
      line-height: 0;
      letter-spacing: .15em;
    }

    // Animated rounded-rect stroke overlaid while a transcription is pending;
    // toggled via the `active` class.
    .loader {
      position: absolute;
      top: 0;
      left: 0;
      width: 100%;
      height: 100%;
      opacity: 0;
      transition: opacity .2s;

      &.active {
        opacity: 1;
      }

      svg {
        width: 100%;
        height: 100%;

        @keyframes loading {
          from { stroke-dashoffset: 100 }
          to { stroke-dashoffset: 0 }
        }

        rect {
          animation: 1s ease-in-out loading infinite;
        }
      }
    }
  }
}
// Transcribed text rendered below the voice-message controls.
// The negative margin-left pulls the block left by icon width + icon margin +
// bubble padding so it spans the full bubble width rather than starting under
// the waveform column.
.audio-to-text {
margin-left: calc(var(--icon-size)*-1 - var(--icon-margin) - var(--padding));
margin-top: 1em;
margin-bottom: 0.6em;
}
.part {
height: 112px !important;
width: 112px !important;
@ -584,4 +644,4 @@
}
}
}
}
}

View File

@ -1411,7 +1411,7 @@ $bubble-border-radius-big: 12px;
}
@include respond-to(handhelds) {
height: 2.375rem;
min-height: 2.375rem;
--icon-margin: .6875rem;
.audio-details {
@ -1440,7 +1440,7 @@ $bubble-border-radius-big: 12px;
.message.audio-message,
.message.voice-message {
// width: 335px;
max-width: unquote("min(100%, 335px)") !important;
max-width: unquote("min(100%, 364px)") !important;
@include respond-to(handhelds) {
// width: 280px;
@ -2718,6 +2718,10 @@ $bubble-border-radius-big: 12px;
color: var(--message-out-primary-color);
}
.audio-to-text-button {
background: var(--message-transcribe-button-out);
}
/* html:not(.is-firefox) */ &-wrapper {
@include respond-to(medium-screens) {
transform: scale(1) translateX(calc((var(--chat-input-size) + #{$btn-send-margin}) * -1));

View File

@ -202,7 +202,6 @@
padding-inline-start: var(--padding-left);
display: flex;
flex-direction: column;
justify-content: center;
cursor: pointer;
// position: relative;
user-select: none;

View File

@ -256,6 +256,8 @@ $chat-input-inner-padding-handhelds: .25rem;
--light-filled-message-primary-color: var(--light-filled-primary-color);
--message-secondary-color: var(--secondary-color);
--message-error-color: var(--danger-color);
--message-transcribe-button: #e8f3ff;
--message-transcribe-button-out: #cceebf;
--message-out-link-color: var(--link-color);
--message-out-status-color: var(--message-out-primary-color);
@ -322,6 +324,8 @@ $chat-input-inner-padding-handhelds: .25rem;
--message-checkbox-border-color: #fff;
--message-secondary-color: var(--secondary-color);
--message-error-color: #fff;
--message-transcribe-button: #2a2a3c;
--message-transcribe-button-out: #8373d3;
--message-out-link-color: #fff;
--message-out-status-color: #fff;