// voiceTranscription.ts
  1. /**
  2. * Voice transcription helper using internal Speech-to-Text service
  3. *
  4. * Frontend implementation guide:
  5. * 1. Capture audio using MediaRecorder API
  6. * 2. Upload audio to storage (e.g., S3) to get URL
  7. * 3. Call transcription with the URL
  8. *
  9. * Example usage:
  10. * ```tsx
  11. * // Frontend component
  12. * const transcribeMutation = trpc.voice.transcribe.useMutation({
  13. * onSuccess: (data) => {
  14. * console.log(data.text); // Full transcription
  15. * console.log(data.language); // Detected language
  16. * console.log(data.segments); // Timestamped segments
  17. * }
  18. * });
  19. *
  20. * // After uploading audio to storage
  21. * transcribeMutation.mutate({
  22. * audioUrl: uploadedAudioUrl,
  23. * language: 'en', // optional
  24. * prompt: 'Transcribe the meeting' // optional
  25. * });
  26. * ```
  27. */
  28. import { ENV } from "./env";
/** Input options for `transcribeAudio`. */
export type TranscribeOptions = {
  /** Publicly fetchable URL of the audio file (e.g. an S3 URL). */
  audioUrl: string;
  /** Optional ISO-639-1 language code hint (e.g. "en", "es", "zh"). */
  language?: string;
  /** Optional custom prompt forwarded to the transcription model. */
  prompt?: string;
};
/**
 * One timestamped segment of the native Whisper API `verbose_json` response.
 * Field names mirror the upstream API exactly (snake_case preserved).
 */
export type WhisperSegment = {
  id: number; // sequential segment index within the transcript
  seek: number;
  start: number; // segment start offset — presumably seconds (verbose_json convention); confirm against service docs
  end: number; // segment end offset — same unit as `start`
  text: string; // transcribed text for this segment
  tokens: number[];
  temperature: number;
  avg_logprob: number; // average token log-probability for the segment
  compression_ratio: number;
  no_speech_prob: number; // model's probability that the segment contains no speech
};
/** Native Whisper API `verbose_json` response format. */
export type WhisperResponse = {
  task: "transcribe";
  language: string; // language detected (or supplied) for the audio
  duration: number; // total audio duration — presumably seconds; confirm against service docs
  text: string; // full transcription text
  segments: WhisperSegment[]; // timestamped segments covering the transcript
};
// The service's response is returned to callers unmodified.
export type TranscriptionResponse = WhisperResponse;
/**
 * Structured failure result returned by `transcribeAudio` instead of throwing.
 * Callers discriminate success from failure via `'error' in result`.
 */
export type TranscriptionError = {
  /** Human-readable summary suitable for surfacing to the user. */
  error: string;
  /** Machine-readable category of the failure. */
  code: "FILE_TOO_LARGE" | "INVALID_FORMAT" | "TRANSCRIPTION_FAILED" | "UPLOAD_FAILED" | "SERVICE_ERROR";
  /** Optional extra context (HTTP status, underlying error message, …). */
  details?: string;
};
  61. /**
  62. * Transcribe audio to text using the internal Speech-to-Text service
  63. *
  64. * @param options - Audio data and metadata
  65. * @returns Transcription result or error
  66. */
  67. export async function transcribeAudio(
  68. options: TranscribeOptions
  69. ): Promise<TranscriptionResponse | TranscriptionError> {
  70. try {
  71. // Step 1: Validate environment configuration
  72. if (!ENV.forgeApiUrl) {
  73. return {
  74. error: "Voice transcription service is not configured",
  75. code: "SERVICE_ERROR",
  76. details: "BUILT_IN_FORGE_API_URL is not set"
  77. };
  78. }
  79. if (!ENV.forgeApiKey) {
  80. return {
  81. error: "Voice transcription service authentication is missing",
  82. code: "SERVICE_ERROR",
  83. details: "BUILT_IN_FORGE_API_KEY is not set"
  84. };
  85. }
  86. // Step 2: Download audio from URL
  87. let audioBuffer: Buffer;
  88. let mimeType: string;
  89. try {
  90. const response = await fetch(options.audioUrl);
  91. if (!response.ok) {
  92. return {
  93. error: "Failed to download audio file",
  94. code: "INVALID_FORMAT",
  95. details: `HTTP ${response.status}: ${response.statusText}`
  96. };
  97. }
  98. audioBuffer = Buffer.from(await response.arrayBuffer());
  99. mimeType = response.headers.get('content-type') || 'audio/mpeg';
  100. // Check file size (16MB limit)
  101. const sizeMB = audioBuffer.length / (1024 * 1024);
  102. if (sizeMB > 16) {
  103. return {
  104. error: "Audio file exceeds maximum size limit",
  105. code: "FILE_TOO_LARGE",
  106. details: `File size is ${sizeMB.toFixed(2)}MB, maximum allowed is 16MB`
  107. };
  108. }
  109. } catch (error) {
  110. return {
  111. error: "Failed to fetch audio file",
  112. code: "SERVICE_ERROR",
  113. details: error instanceof Error ? error.message : "Unknown error"
  114. };
  115. }
  116. // Step 3: Create FormData for multipart upload to Whisper API
  117. const formData = new FormData();
  118. // Create a Blob from the buffer and append to form
  119. const filename = `audio.${getFileExtension(mimeType)}`;
  120. const audioBlob = new Blob([new Uint8Array(audioBuffer)], { type: mimeType });
  121. formData.append("file", audioBlob, filename);
  122. formData.append("model", "whisper-1");
  123. formData.append("response_format", "verbose_json");
  124. // Add prompt - use custom prompt if provided, otherwise generate based on language
  125. const prompt = options.prompt || (
  126. options.language
  127. ? `Transcribe the user's voice to text, the user's working language is ${getLanguageName(options.language)}`
  128. : "Transcribe the user's voice to text"
  129. );
  130. formData.append("prompt", prompt);
  131. // Step 4: Call the transcription service
  132. const baseUrl = ENV.forgeApiUrl.endsWith("/")
  133. ? ENV.forgeApiUrl
  134. : `${ENV.forgeApiUrl}/`;
  135. const fullUrl = new URL(
  136. "v1/audio/transcriptions",
  137. baseUrl
  138. ).toString();
  139. const response = await fetch(fullUrl, {
  140. method: "POST",
  141. headers: {
  142. authorization: `Bearer ${ENV.forgeApiKey}`,
  143. "Accept-Encoding": "identity",
  144. },
  145. body: formData,
  146. });
  147. if (!response.ok) {
  148. const errorText = await response.text().catch(() => "");
  149. return {
  150. error: "Transcription service request failed",
  151. code: "TRANSCRIPTION_FAILED",
  152. details: `${response.status} ${response.statusText}${errorText ? `: ${errorText}` : ""}`
  153. };
  154. }
  155. // Step 5: Parse and return the transcription result
  156. const whisperResponse = await response.json() as WhisperResponse;
  157. // Validate response structure
  158. if (!whisperResponse.text || typeof whisperResponse.text !== 'string') {
  159. return {
  160. error: "Invalid transcription response",
  161. code: "SERVICE_ERROR",
  162. details: "Transcription service returned an invalid response format"
  163. };
  164. }
  165. return whisperResponse; // Return native Whisper API response directly
  166. } catch (error) {
  167. // Handle unexpected errors
  168. return {
  169. error: "Voice transcription failed",
  170. code: "SERVICE_ERROR",
  171. details: error instanceof Error ? error.message : "An unexpected error occurred"
  172. };
  173. }
  174. }
  175. /**
  176. * Helper function to get file extension from MIME type
  177. */
  178. function getFileExtension(mimeType: string): string {
  179. const mimeToExt: Record<string, string> = {
  180. 'audio/webm': 'webm',
  181. 'audio/mp3': 'mp3',
  182. 'audio/mpeg': 'mp3',
  183. 'audio/wav': 'wav',
  184. 'audio/wave': 'wav',
  185. 'audio/ogg': 'ogg',
  186. 'audio/m4a': 'm4a',
  187. 'audio/mp4': 'm4a',
  188. };
  189. return mimeToExt[mimeType] || 'audio';
  190. }
  191. /**
  192. * Helper function to get full language name from ISO code
  193. */
  194. function getLanguageName(langCode: string): string {
  195. const langMap: Record<string, string> = {
  196. 'en': 'English',
  197. 'es': 'Spanish',
  198. 'fr': 'French',
  199. 'de': 'German',
  200. 'it': 'Italian',
  201. 'pt': 'Portuguese',
  202. 'ru': 'Russian',
  203. 'ja': 'Japanese',
  204. 'ko': 'Korean',
  205. 'zh': 'Chinese',
  206. 'ar': 'Arabic',
  207. 'hi': 'Hindi',
  208. 'nl': 'Dutch',
  209. 'pl': 'Polish',
  210. 'tr': 'Turkish',
  211. 'sv': 'Swedish',
  212. 'da': 'Danish',
  213. 'no': 'Norwegian',
  214. 'fi': 'Finnish',
  215. };
  216. return langMap[langCode] || langCode;
  217. }
  218. /**
  219. * Example tRPC procedure implementation:
  220. *
  221. * ```ts
  222. * // In server/routers.ts
  223. * import { transcribeAudio } from "./_core/voiceTranscription";
  224. *
  225. * export const voiceRouter = router({
  226. * transcribe: protectedProcedure
  227. * .input(z.object({
  228. * audioUrl: z.string(),
  229. * language: z.string().optional(),
  230. * prompt: z.string().optional(),
  231. * }))
  232. * .mutation(async ({ input, ctx }) => {
  233. * const result = await transcribeAudio(input);
  234. *
  235. * // Check if it's an error
  236. * if ('error' in result) {
  237. * throw new TRPCError({
  238. * code: 'BAD_REQUEST',
  239. * message: result.error,
  240. * cause: result,
  241. * });
  242. * }
  243. *
  244. * // Optionally save transcription to database
  245. * await db.insert(transcriptions).values({
  246. * userId: ctx.user.id,
  247. * text: result.text,
  248. * duration: result.duration,
  249. * language: result.language,
  250. * audioUrl: input.audioUrl,
  251. * createdAt: new Date(),
  252. * });
  253. *
  254. * return result;
  255. * }),
  256. * });
  257. * ```
  258. */