import {useState} from 'react'
import Box from '@mui/material/Box'
import Modal from '@mui/material/Modal'
import Stack from '@mui/material/Stack'
import Typography from '@mui/material/Typography'
import Wizard, {StepProps} from 'components/forms/Wizard'
import {BasicSettings, CustomCode, TasksUploader} from 'components/evaluations/configure'
import {modalStyle} from 'components/dialogs'
import {taskTypes} from 'components/filters/constants'
import {ExternalA} from 'components/text'
import {Evaluation as EvaluationType, TaskType} from '@equistamp/types'
import {
  BOOLEAN_QUESTION,
  FREE_RESPONSE_QUESTION,
  JSON_QUESTION,
  MULTIPLE_CHOICE_QUESTION,
} from '@equistamp/constants'

/**
 * Informational wizard step explaining that uploading a tasks CSV appends every row as a new
 * task, without checking for duplicates. Takes no props and renders static text only.
 */
const TasksOverwriteInfo = () => {
  return (
    <Typography>
      This wizard helps you choose and configure a CSV file containing tasks for this evaluation.
      All tasks from the CSV file will be added, without checking for duplicates. This means that
      if your CSV file contains tasks that already exist for this evaluation, there will be
      multiple copies of that task. Which in turn means that it's a lot more likely for that task
      to be sent to models when they are being evaluated.
    </Typography>
  )
}

/**
 * Help entries shown next to the wizard steps, grouped by key:
 *  - `settings`: used as the documentation of the basic settings step
 *  - `tasks`: generic entries that `tasksDocs` always includes
 *  - one key per task type constant, plus `mixed`: type specific entries appended by `tasksDocs`
 *
 * Each entry has a `title` and `contents`, where `contents` is a string, a list of strings or
 * a JSX fragment.
 */
const documentationItems = {
  settings: [
    {
      title: 'What is an evaluation?',
      contents: `
            An evaluation is a collection of tasks for AI models to solve, along with a
            method to grade their responses (i.e. a grader). The result of running an evaluation on a model
            is a score, which is basically the percentage of correct tasks (the precise meaning depends
            on the task types and grader used).`,
    },
    {
      title: 'Public vs private evaluations',
      contents: [
        'The actual tasks of an evaluation are always confidential. Only you, and users that you explicitly give access to can view them.',
        'Evaluations are by default public, meaning that anyone can run a model on them or can view the results of previously run models.',
      ],
    },
  ],
  tasks: [
    {
      title: 'How do I choose a test format?',
      // The Stack itself is rendered as the <ul>, so that the <li> items are direct children of
      // the list element (a wrapper element between <ul> and <li> is not valid list markup).
      contents: (
        <Stack component="ul" spacing={2}>
          <li>
            Multiple Choice Questions - these are good for testing knowledge across a broad range of
            topics quickly. Works well when you want to check if someone can identify the correct
            answer among several options
          </li>
          <li>
            Boolean Questions - best for verifying a clear understanding of facts. These are great
            for straightforward, fundamental concepts where there's no middle ground or ambiguity
          </li>
          <li>
            Free Response Questions - use these, when you want to check the ability to fill in the
            gap, or have questions that can have multiple correct answers that can be phrased in
            multiple ways
          </li>
          <li>
            JSON questions - designed to check how well AI models can generate valid JSON from text
            (e.g. emails or tickets)
          </li>
          <li>
            Mixed - this is for cases where you don't want to limit yourself. Mixed evaluations can
            contain all of the different task types
          </li>
        </Stack>
      ),
    },
    {
      title: 'Tasks CSV file',
      contents: [
        `Tasks get imported to our system by providing an appropriate CSV file or Google Sheet.
            The first non empty line of this file will be used as headers - all other lines are
            transformed into tasks. In order for us to be able to create the tasks, you will need to
            let us know how to map your CSV columns to task fields.`,
        `Columns mapping is done once you provide a file we can read - we'll first fetch the
            headers from your file, then let you specify the type of each column.`,
      ],
    },
    {
      title: 'Paraphrases',
      contents: [
        `Lots of models score really well on publicly available evaluations, but then turn out to
            do poorly when used on novel tasks. This is because the models are often trained on the
            evaluations, so can memorize the correct solutions.`,
        `To avoid such data leakage, we support paraphrases, where you can provide multiple
            alternative ways of phrasing each piece of text. When a field has paraphrases defined,
            we will always send paraphrases to models when running evaluations - this can minimize
            the chances of data leakage.`,
      ],
    },
  ],
  [FREE_RESPONSE_QUESTION]: [
    {
      title: 'Free Response Questions',
      contents: (
        <Stack spacing={2}>
          <Typography>
            Free Response Questions are tasks where the model is expected to answer a question with
            free form text.
          </Typography>
          <Typography>
            The default grader for Free Response Questions is to take the{' '}
            <ExternalA
              to="https://en.wikipedia.org/wiki/Cosine_similarity"
              text="cosine similarity"
            />
            of the response and the correct/incorrect answer, which is then compared with a cutoff
            value to decide if the answer is wrong or right.
          </Typography>
        </Stack>
      ),
    },
  ],
  [MULTIPLE_CHOICE_QUESTION]: [
    {
      title: 'Multiple Choice Questions',
      contents: (
        <Stack spacing={2}>
          <Typography>
            Multiple Choice Questions are tasks where the model is expected to answer by selecting
            one of a limited set of possible answers.
          </Typography>
          <Typography>
            You can specify up to 10 correct and 20 incorrect answers, from which the model should
            choose one answer (at least with the default grader). The task is graded as correct if
            the model selects one of the correct answers, and incorrect if it selects one of the
            incorrect ones.
          </Typography>
          <Typography>
            Each task <b>must</b> provide at least one correct and one incorrect answer.
          </Typography>
        </Stack>
      ),
    },
  ],
  [BOOLEAN_QUESTION]: [
    {
      title: 'Boolean Questions',
      contents: (
        <Stack spacing={2}>
          <Typography>Boolean Questions are true/false questions.</Typography>
          <Typography>
            Each task <b>must</b> have a column specifying whether the task is correct. Any value of
            this column that is case-insensitive <code>1</code>, <code>true</code> or{' '}
            <code>yes</code> causes the task to be treated as a true statement. Anything else is
            False.
          </Typography>
          {/* Rendered as a div: Typography is a paragraph by default, and lists are not allowed
              inside a paragraph element */}
          <Typography component="div">
            For example, the following will be treated as true:
            <ul>
              <li>True</li>
              <li>tRuE</li>
              <li>1</li>
            </ul>
            While the following are interpreted as false:
            <ul>
              <li>not true</li>
              <li>false</li>
              <li>this is true</li>
            </ul>
          </Typography>
        </Stack>
      ),
    },
  ],
  [JSON_QUESTION]: [
    {
      title: 'JSON Tasks',
      contents: (
        <Stack spacing={2}>
          <Typography>
            JSON tasks are tasks that expect to get a valid JSON object as their response.
          </Typography>
          <Typography>
            If you want to get a specific object, you can provide what you want via the{' '}
            <code>Expected JSON</code> column. For a task with an expected JSON object provided to
            be graded as correct, it must have the same values for all fields. The model's response
            can contain extra fields, but all the fields in the expected JSON object must be the
            same as in the response object.
          </Typography>
        </Stack>
      ),
    },
    {
      title: 'JSON Schema',
      contents: (
        <Stack spacing={2}>
          <Typography>
            Most use cases involving JSON want the output in a specific format. Just because it's
            valid, doesn't make it useful. To give more control over how the results are graded, you
            can provide a{' '}
            <ExternalA
              to="https://json-schema.org/learn/getting-started-step-by-step"
              text="schema"
            />
            to check the JSON contents.
          </Typography>
          <Typography>
            <ExternalA to="https://www.jsonschemavalidator.net/" text="This" />
            tool has a lot of schemas for many common APIs, while{' '}
            <ExternalA to="https://extendsclass.com/json-schema-validator.html" text="this" />
            one can help you generate a schema from an example JSON object.
          </Typography>
        </Stack>
      ),
    },
  ],
  mixed: [
    {
      title: 'Mixed Tasks',
      contents: (
        <Stack spacing={2}>
          <Typography>
            Mixed tasks can be any of the supported tasks. This is a way for an evaluation to have
            multiple types of tasks - normally an evaluation is e.g. just multiple choice questions.
          </Typography>
          <Typography>
            Since we don't know what kind of tasks to expect, you <b>must</b> specify the type of
            each task. You do this by providing a <code>type</code> column in your CSV file, where
            each cell is one of{' '}
            {Object.keys(taskTypes)
              .map((v) => `"${v}"`)
              .join(', ')}
          </Typography>
        </Stack>
      ),
    },
  ],
}

/**
 * Builds the documentation entries for the tasks upload step.
 *
 * @param taskType - task type of the evaluation; the `mixed` entries are used when it is falsy
 * @returns the generic `tasks` entries followed by the entries specific to the task type
 */
const tasksDocs = (taskType?: TaskType) => {
  const typeSpecificDocs = documentationItems[taskType || 'mixed']
  return [...documentationItems.tasks, ...typeSpecificDocs]
}

/** Props accepted by `EvaluationWizard`. */
type EvaluationWizardProps = {
  /** Evaluation used to seed the wizard's state; an empty object is used when it is omitted. */
  initial?: EvaluationType
  /**
   * Called with the current evaluation when the wizard finishes. It is also handed to the tasks
   * upload step.
   * NOTE(review): the meaning of the resolved string/null is not visible here - confirm with callers.
   */
  onSave: (e: EvaluationType) => Promise<string | null>
  /** Forwarded to the underlying `Wizard` as its cancel handler. */
  onCancel?: () => void
  /** When truthy, the "Basic settings" step is included. */
  basic?: boolean
  /** When truthy, the informational "Tasks upload information" step is included. */
  tasksOverwrite?: boolean
  /** When truthy, the tasks upload step is included. */
  tasks?: boolean
  /** When truthy, the "Prompts & Graders" step is included. */
  customCode?: boolean
  /** Title forwarded to the underlying `Wizard`. */
  title: string
}
/**
 * Multi step wizard for configuring an evaluation.
 *
 * The evaluation being edited is kept in local state (seeded from `initial`) and every step
 * receives it together with an `onChange(field, value)` updater. Which steps are shown is
 * controlled by the `basic`, `tasksOverwrite`, `tasks` and `customCode` flags; the steps always
 * appear in that order. Nothing is rendered when no step is enabled.
 */
const EvaluationWizard = ({
  initial,
  onSave,
  onCancel,
  basic,
  tasks,
  tasksOverwrite,
  customCode,
  title,
}: EvaluationWizardProps) => {
  const [evaluation, setEvaluation] = useState(initial || ({} as unknown as EvaluationType))

  // Shallow-copies the evaluation with a single field replaced
  const updateField = (field: string, val: any) =>
    setEvaluation((current) => ({...current, [field]: val}))
  const handleFinish = async () => onSave(evaluation)

  // The same props object is shared by all the steps that edit the evaluation
  const stepProps = {evaluation, onChange: updateField}

  const steps = [
    ...(basic
      ? [
          {
            label: 'Basic settings',
            Step: BasicSettings,
            documentation: documentationItems.settings,
            extraProps: stepProps,
          },
        ]
      : []),
    ...(tasksOverwrite
      ? [
          {
            label: 'Tasks upload information',
            Step: TasksOverwriteInfo,
          },
        ]
      : []),
    ...(tasks
      ? [
          {
            label: 'Upload evaluation',
            nextLabel: 'Create evaluation',
            documentation: tasksDocs(evaluation.default_task_type),
            Step: TasksUploader,
            extraProps: {...stepProps, onSave},
          },
        ]
      : []),
    ...(customCode
      ? [
          {
            label: 'Prompts & Graders',
            Step: CustomCode,
            extraProps: stepProps,
          },
        ]
      : []),
  ] as StepProps<any>[]

  // No enabled steps means there is nothing to show
  if (!steps.length) {
    return null
  }
  return <Wizard title={title} onFinish={handleFinish} onCancel={onCancel} steps={steps} />
}

/** Props for `EvalWizardModal`: every `EvaluationWizard` prop plus control over the modal. */
type EvalWizardModalProps = {
  /** Whether the modal is open; it is treated as closed when omitted. */
  show?: boolean
  /** Used as the modal's close handler; required here, while optional in `EvaluationWizardProps`. */
  onCancel: () => void
} & EvaluationWizardProps
/**
 * `EvaluationWizard` displayed inside a modal dialog.
 *
 * The modal is open while `show` is truthy and calls `onCancel` when it is closed. The complete
 * props object is forwarded to the wrapped wizard.
 */
export const EvalWizardModal = (props: EvalWizardModalProps) => {
  const {show, onCancel} = props
  const isOpen = show || false

  return (
    <Modal open={isOpen} onClose={onCancel}>
      <Box sx={{...modalStyle, m: 10, p: 0, border: 'none'}}>
        <EvaluationWizard {...props} />
      </Box>
    </Modal>
  )
}

export default EvaluationWizard
