using System.Collections;
using UnityEngine;
using UnityEngine.Networking;
using System;

namespace NTTQONOQ.Android.MiRZA.Library.Samples.GPTImageAnalyzerSample
{
    // Converts text to speech with the Google Cloud Text-to-Speech REST API and plays
    // the synthesized audio through the AudioSource attached to the same GameObject.
    [RequireComponent(typeof(AudioSource))]
    public class GoogleTextToSpeechConverter : MonoBehaviour
    {
        // REST endpoint; the (escaped) API key is appended as the "key" query parameter.
        private const string SynthesizeEndpoint = "https://texttospeech.googleapis.com/v1/text:synthesize?key=";

        // Voice and audio settings sent with every request.
        private const string LanguageCode = "ja-JP";
        private const string VoiceName = "ja-JP-Neural2-B";
        private const string Linear16Encoding = "LINEAR16";
        private const int RequestedSampleRateHertz = 16000;

        // Fallback layout used when the payload carries no usable WAV header.
        private const int DefaultChannelCount = 1;

        // LINEAR16 samples are 16-bit, i.e. two bytes each.
        private const int BytesPerSample = 2;

        // RIFF container layout: "RIFF" + size + "WAVE" (12 bytes), then chunks made of
        // a 4-byte tag and a 4-byte little-endian size followed by the chunk body.
        private const int RiffHeaderLength = 12;
        private const int ChunkHeaderLength = 8;
        private const int MinFormatChunkLength = 16;

        // Google Cloud Platform API key.
        // Not required when _gcpApiKeyText is assigned.
        [SerializeField] private string _gcpApiKey;

        // Text file containing the API key.
        // Not required when _gcpApiKey is assigned.
        [SerializeField] private TextAsset _gcpApiKeyText;

        // Full request URL including the API key; stays null until Start finds a key.
        private string _url;

        // Audio output.
        private AudioSource _audioSource;

        // Clip created for the most recent utterance. It is tracked so it can be
        // destroyed when replaced; runtime-created Unity objects are not garbage collected.
        private AudioClip _generatedClip;

        // Request/response DTOs. Field names are the JSON keys of the REST API and
        // must stay public fields so that JsonUtility serializes them.
        [Serializable]
        private class SynthesisInput
        {
            public string text;
        }

        [Serializable]
        private class VoiceSelectionParams
        {
            // Japanese by default.
            public string languageCode = LanguageCode;
            public string name;
        }

        [Serializable]
        private class AudioConfig
        {
            public string audioEncoding = Linear16Encoding;
            public int speakingRate = 1;
            public int pitch = 0;
            public int sampleRateHertz = RequestedSampleRateHertz;
        }

        [Serializable]
        private class SynthesisRequest
        {
            public SynthesisInput input;
            public VoiceSelectionParams voice;
            public AudioConfig audioConfig;
        }

        [Serializable]
        private class SynthesisResponse
        {
            public string audioContent;
        }

        // Resolves the API key, builds the request URL and speaks a short message
        // confirming that speech output works.
        private void Start()
        {
            _audioSource = GetComponent<AudioSource>();

            // When an API key file is assigned, its content takes precedence.
            if (_gcpApiKeyText)
            {
                _gcpApiKey = _gcpApiKeyText.text;
            }

            // Unity serializes an unset string field as "" rather than null, and key
            // files usually end with a newline, so trim before checking for emptiness.
            _gcpApiKey = _gcpApiKey?.Trim();
            if (string.IsNullOrEmpty(_gcpApiKey))
            {
                Debug.LogWarning("Google Text-to-Speech is disabled: no API key is configured.");
                return;
            }

            _url = SynthesizeEndpoint + Uri.EscapeDataString(_gcpApiKey);

            SynthesizeAndPlay("音声の出力機能は有効です");
        }

        // Releases the last generated clip together with this component.
        private void OnDestroy()
        {
            if (_generatedClip != null)
            {
                Destroy(_generatedClip);
                _generatedClip = null;
            }
        }

        // Synthesizes the given text asynchronously and plays it once the audio arrives.
        // The call is ignored (with a warning) when no API key has been configured yet
        // or when the text is null or empty.
        public void SynthesizeAndPlay(string text)
        {
            if (string.IsNullOrEmpty(_url))
            {
                Debug.LogWarning("Google Text-to-Speech request skipped: no API key has been configured.");
                return;
            }

            if (string.IsNullOrEmpty(text))
            {
                Debug.LogWarning("Google Text-to-Speech request skipped: the text is empty.");
                return;
            }

            StartCoroutine(Synthesize(text));
        }

        // Coroutine that posts the synthesis request and hands the Base64 audio content
        // of a successful response over for playback. Failures are logged.
        private IEnumerator Synthesize(string text)
        {
            var requestData = new SynthesisRequest
            {
                input = new SynthesisInput { text = text },
                voice = new VoiceSelectionParams { languageCode = LanguageCode, name = VoiceName },
                audioConfig = new AudioConfig()
            };

            using var www = new UnityWebRequest(_url, "POST");
            var bodyRaw = System.Text.Encoding.UTF8.GetBytes(JsonUtility.ToJson(requestData));
            www.uploadHandler = new UploadHandlerRaw(bodyRaw);
            www.downloadHandler = new DownloadHandlerBuffer();
            www.SetRequestHeader("Content-Type", "application/json");
            yield return www.SendWebRequest();

            if (www.result != UnityWebRequest.Result.Success)
            {
                // The response body is appended because it carries the service's error details.
                Debug.LogError("Google Text-to-Speech Error: " + www.error + "\n" + www.downloadHandler.text);
                yield break;
            }

            var synthesisResponse = JsonUtility.FromJson<SynthesisResponse>(www.downloadHandler.text);
            if (synthesisResponse == null || string.IsNullOrEmpty(synthesisResponse.audioContent))
            {
                Debug.LogError("Google Text-to-Speech Error: the response contained no audio content.");
                yield break;
            }

            PlayAudioFromBase64(synthesisResponse.audioContent);
        }

        // Decodes Base64 audio content and plays it. Invalid input is logged, not thrown.
        private void PlayAudioFromBase64(string base64AudioData)
        {
            if (string.IsNullOrEmpty(base64AudioData))
            {
                Debug.LogError("Google Text-to-Speech Error: the audio content is empty.");
                return;
            }

            byte[] audioBytes;
            try
            {
                audioBytes = Convert.FromBase64String(base64AudioData);
            }
            catch (FormatException e)
            {
                Debug.LogError("Google Text-to-Speech Error: the audio content is not valid Base64. " + e.Message);
                return;
            }

            LoadAudioClipAndPlay(audioBytes);
        }

        // Converts 16-bit little-endian PCM into an AudioClip and plays it.
        // The Text-to-Speech API documents that LINEAR16 output includes a WAV header,
        // so the header is skipped when present; decoding it as samples would produce
        // an audible click at the start of every utterance.
        private void LoadAudioClipAndPlay(byte[] audioData)
        {
            if (audioData == null)
            {
                Debug.LogError("Google Text-to-Speech Error: no audio data to play.");
                return;
            }

            if (!TryReadWavLayout(audioData, out var pcmOffset, out var pcmLength, out var sampleRate, out var channels))
            {
                // No usable WAV header: treat the whole buffer as raw PCM in the requested format.
                pcmOffset = 0;
                pcmLength = audioData.Length;
                sampleRate = RequestedSampleRateHertz;
                channels = DefaultChannelCount;
            }

            // A frame holds one sample per channel; a trailing partial frame is dropped.
            var frameCount = pcmLength / (BytesPerSample * channels);
            if (frameCount <= 0)
            {
                Debug.LogWarning("Google Text-to-Speech returned audio without any samples.");
                return;
            }

            var sampleCount = frameCount * channels;
            var samples = new float[sampleCount];
            for (var i = 0; i < sampleCount; i++)
            {
                // Assemble the sample explicitly as little-endian so the result does not
                // depend on the endianness of the host CPU.
                var byteIndex = pcmOffset + i * BytesPerSample;
                var sample = unchecked((short)(audioData[byteIndex] | (audioData[byteIndex + 1] << 8)));
                samples[i] = sample / 32768.0f;
            }

            var clip = AudioClip.Create("SynthesizedSpeech", frameCount, channels, sampleRate, false);
            clip.SetData(samples, 0);

            var previousClip = _generatedClip;
            _generatedClip = clip;

            _audioSource.clip = clip;
            _audioSource.Play();

            // Destroy the clip of the previous utterance now that it is no longer assigned.
            if (previousClip != null)
            {
                Destroy(previousClip);
            }
        }

        // Looks for a RIFF/WAVE header and, when a "data" chunk is found, reports where
        // the PCM bytes are together with the channel count and sample rate declared in
        // the "fmt " chunk. Returns false when the buffer is not a WAV file or has no
        // "data" chunk; the out values must not be used in that case.
        private static bool TryReadWavLayout(byte[] audioData, out int pcmOffset, out int pcmLength, out int sampleRate, out int channels)
        {
            pcmOffset = 0;
            pcmLength = 0;
            sampleRate = RequestedSampleRateHertz;
            channels = DefaultChannelCount;

            if (audioData.Length < RiffHeaderLength || !HasTag(audioData, 0, "RIFF") || !HasTag(audioData, 8, "WAVE"))
            {
                return false;
            }

            long position = RiffHeaderLength;
            while (position + ChunkHeaderLength <= audioData.Length)
            {
                var chunkStart = (int)position;
                var bodyStart = chunkStart + ChunkHeaderLength;
                long declaredSize = ReadUInt32LittleEndian(audioData, chunkStart + 4);
                long available = audioData.Length - bodyStart;

                if (HasTag(audioData, chunkStart, "data"))
                {
                    pcmOffset = bodyStart;

                    // The declared size is not trusted when it is zero or larger than the
                    // buffer (presumably a placeholder written before the final length
                    // was known); everything after the chunk header is used instead.
                    var useAvailable = declaredSize == 0 || declaredSize > available;
                    pcmLength = (int)(useAvailable ? available : declaredSize);
                    return true;
                }

                if (HasTag(audioData, chunkStart, "fmt ") && declaredSize >= MinFormatChunkLength && available >= MinFormatChunkLength)
                {
                    // fmt body: format tag (2 bytes), channel count (2), sample rate (4), ...
                    int declaredChannels = ReadUInt16LittleEndian(audioData, bodyStart + 2);
                    long declaredSampleRate = ReadUInt32LittleEndian(audioData, bodyStart + 4);

                    if (declaredChannels > 0)
                    {
                        channels = declaredChannels;
                    }

                    if (declaredSampleRate > 0 && declaredSampleRate <= int.MaxValue)
                    {
                        sampleRate = (int)declaredSampleRate;
                    }
                }

                // Chunk bodies are padded to an even number of bytes.
                position = bodyStart + declaredSize + (declaredSize & 1);
            }

            return false;
        }

        // Returns true when the ASCII tag is stored at the given offset of the buffer.
        private static bool HasTag(byte[] data, int offset, string tag)
        {
            if (offset < 0 || offset + tag.Length > data.Length)
            {
                return false;
            }

            for (var i = 0; i < tag.Length; i++)
            {
                if (data[offset + i] != tag[i])
                {
                    return false;
                }
            }

            return true;
        }

        // Reads an unsigned 16-bit little-endian integer; the caller guarantees the range.
        private static ushort ReadUInt16LittleEndian(byte[] data, int offset)
        {
            return unchecked((ushort)(data[offset] | (data[offset + 1] << 8)));
        }

        // Reads an unsigned 32-bit little-endian integer; the caller guarantees the range.
        private static uint ReadUInt32LittleEndian(byte[] data, int offset)
        {
            return unchecked((uint)(data[offset]
                                    | (data[offset + 1] << 8)
                                    | (data[offset + 2] << 16)
                                    | (data[offset + 3] << 24)));
        }
    }
}
