import { LAMBDA_BASE_URL } from "API/lambdas/index";

/**
 * Fetches the extracted content of a document from the
 * `pdf-data-extractor-prod-main` lambda endpoint.
 *
 * @param {string} docUrl - URL of the document; sent as the `docUrl` query
 *   parameter (percent-encoded by this function).
 * @param {string} [formatType="text"] - Requested format, sent as `formatType`.
 *   `"xml"` is parsed into a DOM document; any other value is treated as a
 *   list of pages.
 * @param {boolean} [mergePages=false] - When true, the page texts are joined
 *   with a single space and returned as one string.
 * @param {number} [pageLimit=1] - Sent as `maxPage` for every format except
 *   `"text"`, which always requests 1.
 * @returns {Promise<string|Array|Document|null>}
 *   - non-200 response: `""` when `formatType` is `"text"`, otherwise `null`;
 *   - `"xml"`: the result of `DOMParser#parseFromString(data, "text/xml")`;
 *   - empty or missing payload: `""` (regardless of `mergePages`);
 *   - otherwise: an array holding each page's `text`, or the joined string
 *     when `mergePages` is true.
 * @throws Propagates `fetch` failures and JSON parse errors of a 200 response.
 */
const documentExtractText = async (
  docUrl,
  formatType = "text",
  mergePages = false,
  pageLimit = 1
) => {
  const maxPage = formatType === "text" ? 1 : pageLimit;

  // Encode every value: a document URL containing `&`, `?`, `#` or `=`
  // (e.g. a signed URL) would otherwise be split into bogus query parameters.
  const query = [
    `docUrl=${encodeURIComponent(docUrl)}`,
    `formatType=${encodeURIComponent(formatType)}`,
    `maxPage=${encodeURIComponent(maxPage)}`,
  ].join("&");

  // Get the text from the document using the lambda
  const response = await fetch(
    `${LAMBDA_BASE_URL}/pdf-data-extractor-prod-main?${query}`
  );

  // Check the status before touching the body: error responses are not
  // guaranteed to be JSON, and parsing them first would throw instead of
  // yielding the fallback value.
  if (response.status !== 200) {
    return formatType === "text" ? "" : null;
  }

  const data = await response.json();

  // NOTE(review): a sample payload previously recorded here looked like a
  // JSON *string* of the form {"content": [{"method": "mupdf", "en_text": ...}]},
  // which does not match the `page.text` access below — confirm the lambda's
  // actual response shape.

  // If it's XML, return it as a parsed DOM document
  if (formatType === "xml") {
    return new DOMParser().parseFromString(data, "text/xml");
  }

  // Nothing extracted
  if (!data || data.length === 0) {
    return "";
  }

  // Otherwise collect the text of every page, optionally merged
  const pages = data.map((page) => page.text);

  return mergePages ? pages.join(" ") : pages;
};

export default documentExtractText;
