import { recognize } from 'tesseract.js';
import * as pdfjsLib from 'pdfjs-dist/webpack.mjs';

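// NOTE(review): manual worker override, currently disabled. The 'pdfjs-dist/webpack.mjs'
// entry point imported above presumably configures the pdf.js worker itself — confirm
// before re-enabling or deleting the line below.
// pdfjsLib.GlobalWorkerOptions.workerSrc = `https://cdn.jsdelivr.net/npm/pdfjs-dist@4.0.379/+esm`;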

/**
 * Extracts the text of every given file and concatenates the results.
 *
 * All files are handed to `processFile` up front, so they are processed
 * concurrently; the output keeps the input order.
 *
 * @param {ArrayLike<File>|Iterable<File>} files - Files to read (anything `Array.from` accepts).
 * @returns {Promise<string>} Per-file texts joined with a newline.
 */
export async function extractTextFromFiles(files) {
  const pendingTexts = [];
  for (const file of Array.from(files)) {
    pendingTexts.push(processFile(file));
  }

  const resolvedTexts = await Promise.all(pendingTexts);
  return resolvedTexts.join('\n');
}

/**
 * Extracts plain text from a single uploaded file, dispatching on its MIME type.
 *
 * - `application/pdf`: text items of every page (via pdf.js), space-separated.
 * - `text/plain`: file bytes decoded as UTF-8.
 * - `image/jpeg`, `image/png`, `image/heic`: OCR via tesseract.js (`'eng'`).
 *   NOTE(review): whether the OCR engine can actually decode HEIC is not
 *   verified here — confirm.
 *
 * Failures (including unsupported types) are reported to the user with
 * `alert` and turned into an empty string, so this function never rejects.
 *
 * @param {File} file - The uploaded file.
 * @returns {Promise<string>} The extracted text, or `''` on failure.
 */
const processFile = async (file) => {
  try {
    switch (file.type) {
      case 'application/pdf': {
        // Read the bytes once; no FileReader round-trip is needed.
        const pdfBytes = new Uint8Array(await file.arrayBuffer());
        const pdfDocument = await pdfjsLib.getDocument(pdfBytes).promise;

        let text = '';
        // pdf.js page numbers are 1-based.
        for (let pageNumber = 1; pageNumber <= pdfDocument.numPages; pageNumber++) {
          const page = await pdfDocument.getPage(pageNumber);
          const textContent = await page.getTextContent();
          text += `${textContent.items.map((item) => item.str).join(' ')} `;
        }
        return text;
      }

      case 'text/plain':
        return new TextDecoder('utf-8').decode(await file.arrayBuffer());

      case 'image/jpeg':
      case 'image/png':
      case 'image/heic': {
        // Hand the image to the OCR engine through a temporary blob URL.
        const imageUrl = URL.createObjectURL(file);
        try {
          const { data: { text } } = await recognize(imageUrl, 'eng');
          return text;
        } finally {
          // Always release the blob URL, whether OCR succeeded or threw.
          URL.revokeObjectURL(imageUrl);
        }
      }

      default:
        throw new Error('Unsupported file type');
    }
  } catch (error) {
    alert(`Failed to read the uploaded file:\n${error}`);
    return '';
  }
};

/**
 * Recursively collects the text under a DOM node, appending a single space
 * after each block-level element so adjacent blocks do not run together.
 *
 * Text nodes contribute their `textContent`; element nodes contribute the
 * concatenated text of their children; every other node type contributes
 * nothing.
 *
 * @param {Node} node - Root of the subtree to read.
 * @returns {string} The collected text.
 */
function extractInnerTextWithSpaces(node) {
  if (node.nodeType === Node.TEXT_NODE) {
    return node.textContent || '';
  }

  if (node.nodeType !== Node.ELEMENT_NODE) {
    return '';
  }

  const childText = Array.from(node.childNodes).reduce(
    (collected, child) => collected + extractInnerTextWithSpaces(child),
    '',
  );

  // A trailing space separates this block-level element from what follows it.
  const isBlockLevel =
    node instanceof HTMLElement && /^(div|p|h[1-6]|ul|ol|li)$/i.test(node.tagName);

  return isBlockLevel ? childText + ' ' : childText;
}

/**
 * Parses an HTML string and returns the text of its body, with a space
 * inserted after block-level elements.
 *
 * @param {string} htmlString - HTML markup to parse as `text/html`.
 * @returns {string} Text extracted from the parsed document's `<body>`.
 */
export function extractInnerTextFromHTML(htmlString) {
  const { body } = new DOMParser().parseFromString(htmlString, 'text/html');
  return extractInnerTextWithSpaces(body);
}