Full Stack Personal Blog 13: AI Chat Design
cz2025.06.03 15:07

Integrating AI models teaches you how to work with third-party APIs, handle natural language generation tasks, and optimize request/response management to enable intelligent conversation features.

1. Chat Logic

  • Integrate the Deepseek chat model. I originally wanted to use OpenAI, but it doesn't support registration with Chinese credit cards, so I reluctantly spent 10 yuan to recharge Deepseek—so far, my only expense.
  • Input: voice and text; Output: text converted to audio playback
  • Native browser support: SpeechRecognition for voice-to-text, SpeechSynthesisUtterance for text-to-speech

2. API Design

  • Create a Deepseek API token and obtain DEEPSEEK_API_KEY
  • Design the endpoint: /api/callGpt, refer to the official example
// /api/callGpt/route.ts
import { NextResponse } from 'next/server'

/**
 * POST /api/callGpt — proxy a single-turn chat request to the DeepSeek API.
 *
 * Request body:  { message: string } — the user's prompt.
 * Responses:
 *   200 { text: string }  — the model's reply (empty string if absent)
 *   400 { error: string } — malformed JSON or missing/empty "message"
 *   500 { error: string } — upstream DeepSeek failure
 */
export async function POST(req: Request) {
  // Parse and validate the body BEFORE spending an upstream (paid) API call.
  let body: { message?: unknown }
  try {
    body = await req.json()
  } catch {
    return NextResponse.json({ error: 'Invalid JSON body' }, { status: 400 })
  }

  const message = body?.message
  if (typeof message !== 'string' || message.trim() === '') {
    return NextResponse.json(
      { error: 'Missing or empty "message" field' },
      { status: 400 },
    )
  }

  const deepseekRes = await fetch('https://api.deepseek.com/chat/completions', {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      Authorization: `Bearer ${process.env.DEEPSEEK_API_KEY!}`,
    },
    body: JSON.stringify({
      model: 'deepseek-chat',
      stream: false, // single complete response; no SSE streaming
      messages: [
        {
          role: 'user',
          content: message,
        },
      ],
    }),
  })

  if (!deepseekRes.ok) {
    return NextResponse.json(
      { error: 'Failed to fetch from DeepSeek API' },
      { status: 500 },
    )
  }

  const result = await deepseekRes.json()
  // DeepSeek mirrors the OpenAI chat-completions shape: choices[0].message.content
  const text = result.choices?.[0]?.message?.content || ''

  return NextResponse.json({ text })
}

3. Frontend Design

  • recognition.continuous = true: keep listening until the user finishes a complete sentence, rather than returning a result after every short pause
  • The speak function converts text to audio. Browser compatibility is average; it may not work on mobile devices
// aiTalk/page.tsx
'use client'
import MarkdownView from '@/components/MarkdownView'
import { Button } from '@/components/ui/button'
import { Input } from '@/components/ui/input'
import { ChevronRight } from 'lucide-react'
import { useLocale, useTranslations } from 'next-intl'
import { useEffect, useRef, useState } from 'react'
/**
 * Voice/text chat page backed by /api/callGpt.
 *
 * Input:  push-to-talk speech (Web Speech API SpeechRecognition) or a text box.
 * Output: the reply is rendered as markdown AND spoken aloud via
 *         SpeechSynthesisUtterance. Browser support varies, notably on mobile.
 */
export default function PageAiTalk() {
  const t = useTranslations('PageAiTalk')
  const locale = useLocale()
  const recognitionRef = useRef<SpeechRecognition | null>(null)
  const synthRef = useRef<SpeechSynthesisUtterance | null>(null)
  const chatContainerRef = useRef<HTMLDivElement>(null)

  const [message, setMessage] = useState('')
  const [response, setResponse] = useState('')
  const [isLoading, setIsLoading] = useState(false)
  const [chatHistory, setChatHistory] = useState<
    { message: string; response: string }[]
  >([])
  const [inputMode, setInputMode] = useState<'voice' | 'text'>('voice')

  // Speak `text` aloud, cancelling any utterance already in progress.
  const speak = (text: string) => {
    if (!window.speechSynthesis) return
    window.speechSynthesis.cancel()

    const utterance = new SpeechSynthesisUtterance(text)
    utterance.lang = locale === 'zh' ? 'zh-CN' : 'en-US'
    utterance.rate = 1.1
    utterance.pitch = 1

    // Prefer an installed voice matching the current locale, if any.
    // NOTE: getVoices() may return [] before the voiceschanged event fires;
    // in that case the browser's default voice is used.
    const voices = speechSynthesis.getVoices()
    const matchedVoice = voices.find((v) =>
      locale === 'zh' ? v.lang.includes('zh') : v.lang.includes('en'),
    )
    if (matchedVoice) utterance.voice = matchedVoice

    synthRef.current = utterance
    window.speechSynthesis.speak(utterance)
  }

  // Send one user message to the backend, then render + speak the reply.
  const fetchChatGPT = async (text: string) => {
    setIsLoading(true)
    try {
      const res = await fetch('/api/callGpt', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ message: text }),
      })
      // Guard against API errors: without this, data.text is undefined and
      // "undefined" would be spoken aloud and stored in the history.
      if (!res.ok) {
        throw new Error(`callGpt failed with status ${res.status}`)
      }
      const data = await res.json()
      const fullReply = data.text
      setResponse(fullReply)
      speak(fullReply)

      setChatHistory((prev) => [
        ...prev,
        { message: text, response: fullReply },
      ])
      setMessage('')
    } catch (error) {
      console.error('Error:', error)
    } finally {
      setIsLoading(false)
    }
  }

  // Set up speech recognition once per locale; torn down on unmount/locale change.
  useEffect(() => {
    if (
      !('webkitSpeechRecognition' in window || 'SpeechRecognition' in window)
    ) {
      alert(t('browserNotSupported'))
      return
    }

    const SpeechRecognition =
      window.SpeechRecognition || window.webkitSpeechRecognition
    // `as any`: lib.dom typings for the (still-prefixed) Web Speech API are
    // incomplete across browsers.
    const recognition = new SpeechRecognition() as any
    recognition.lang = locale === 'zh' ? 'zh-CN' : 'en-US'
    recognition.interimResults = false
    recognition.continuous = true

    recognition.onstart = () => setMessage(`🎤 ${t('listening')}`)
    recognition.onresult = (event: any) => {
      // With continuous = true, results accumulate across the session:
      // event.results[0] is always the FIRST utterance. Read the most
      // recent result, otherwise every later recognition re-sends the
      // first transcript.
      const lastResult = event.results[event.results.length - 1]
      const transcript = lastResult[0].transcript
      console.log('Recognized text:', transcript)
      setMessage(transcript)
      fetchChatGPT(transcript)
    }
    recognition.onerror = (e: any) => {
      console.error('Recognition error:', e)
      setMessage(`❌ ${t('recognitionError')}`)
    }
    recognition.onend = () => {
      console.log('Recognition ended')
      setMessage('')
    }

    recognitionRef.current = recognition
    return () => recognition.stop()
  }, [locale])

  // Keep the newest exchange in view.
  useEffect(() => {
    if (chatContainerRef.current) {
      chatContainerRef.current.scrollTop = chatContainerRef.current.scrollHeight
    }
  }, [chatHistory])

  const holdTimer = useRef<NodeJS.Timeout | null>(null)

  // Push-to-talk: the 200ms delay filters out accidental taps.
  const handlePressStart = (e: any) => {
    e.preventDefault()
    holdTimer.current = setTimeout(() => {
      if (!recognitionRef.current) return
      console.log('Start recognition')
      recognitionRef.current.start()
    }, 200)
  }

  const handlePressEnd = (e: any) => {
    e.preventDefault()
    clearTimeout(holdTimer.current as NodeJS.Timeout)
    if (!recognitionRef.current) return
    console.log('Stop recognition')
    recognitionRef.current.stop()
  }

  return (
    <div className="page-wrapper py-6">
      <div className="mx-auto max-w-[680px]">
        <h1 className="mb-4 text-center text-xl font-bold">🎙️ {t('title')}</h1>

        <div className="mb-4 flex w-full justify-center">
          <Button
            variant="outline"
            onClick={() =>
              setInputMode((prev) => (prev === 'voice' ? 'text' : 'voice'))
            }
          >
            {t('currentMode')}: {inputMode === 'voice' ? t('voice') : t('text')}
            <ChevronRight className="size-4" />
          </Button>
        </div>

        <div
          ref={chatContainerRef}
          className="md:[60vh] bg-muted mb-4 h-[calc(100vh-20rem)] overflow-y-auto rounded-lg p-4"
        >
          <ul className="space-y-3">
            {chatHistory.map((chat, index) => (
              <li key={index} className="bg-background rounded-lg p-3 shadow">
                <p className="font-semibold">
                  {t('yourMessage')}:
                  <span className="font-normal">{chat.message}</span>
                </p>
                <div className="">
                  <div className="font-semibold">AI:</div>
                  <div className="">
                    <MarkdownView content={chat.response} />
                  </div>
                </div>
              </li>
            ))}
          </ul>
          {isLoading && (
            <div className="mt-4 flex items-center justify-center gap-1">
              <div className="inline-flex space-x-2">
                <div className="bg-muted-foreground size-1 animate-bounce rounded-full [animation-delay:-0.3s]"></div>
                <div className="bg-muted-foreground size-1 animate-bounce rounded-full [animation-delay:-0.15s]"></div>
                <div className="bg-muted-foreground size-1 animate-bounce rounded-full"></div>
              </div>
            </div>
          )}
        </div>

        <div className="flex w-full justify-center">
          {inputMode === 'voice' ? (
            <Button
              className="bg-foreground active:bg-foreground/80 w-full cursor-pointer select-none rounded-lg py-2 font-bold transition duration-200 md:w-[50%]"
              onMouseDown={handlePressStart}
              onMouseUp={handlePressEnd}
              onTouchStart={handlePressStart}
              onTouchEnd={handlePressEnd}
              size="lg"
            >
              {t('pressAndSpeak')}
            </Button>
          ) : (
            <div className="flex w-full items-center gap-2">
              <Input
                type="text"
                className="h-10 flex-1 px-4"
                placeholder={t('enterText')}
                value={message}
                onChange={(e) => setMessage(e.target.value)}
              />
              <Button
                className="cursor-pointer px-4 py-2"
                size="lg"
                onClick={() => {
                  if (message.trim()) {
                    fetchChatGPT(message.trim())
                  }
                }}
              >
                {t('send')}
              </Button>
            </div>
          )}
        </div>
      </div>
    </div>
  )
}

Comments