import logo from './logo.svg';
import './App.css';

import React from 'react';
import Container from '@mui/material/Container';
import Typography from '@mui/material/Typography';
import { Prism as SyntaxHighlighter } from 'react-syntax-highlighter';
import { base16AteliersulphurpoolLight } from "react-syntax-highlighter/dist/esm/styles/prism"
import { createTheme, responsiveFontSizes, ThemeProvider } from '@mui/material/styles';
import Button from '@mui/material/Button';
import { BrowserRouter as Router, Routes, Route, Link } from 'react-router-dom';
import AppBar from '@mui/material/AppBar';
import Toolbar from '@mui/material/Toolbar';

// Base MUI theme for the site: brand palette, typography variants, and
// per-component style overrides.
const baseTheme = createTheme({
  palette: {
    primary: {
      main: '#005e9c',
      light: '#4791db',
      dark: '#005e9c',
      contrastText: '#fff',
    },
    secondary: {
      main: '#f50057',
      light: '#ff4081',
      dark: '#c51162',
      contrastText: '#fff',
    }
  },
  typography: {
    fontFamily: 'Helvetica',
    h1: {
      fontWeight: 600,
      fontSize: '4.5rem',
      lineHeight: '2',
      letterSpacing: '-0.01562em',
      // NOTE(review): `spacing` is not a standard CSS property — confirm what was intended here.
      spacing: '1rem',
      textAlign: 'left'
    },
    h2: {
      fontWeight: 900,
      fontSize: '2rem',
      lineHeight: '2',
      letterSpacing: '-0.00833em',
      paddingTop: '.5rem',
      textAlign: 'left'
    },
    h3: {
      fontWeight: 600,
      fontSize: '1.2rem',
      lineHeight: '1.2',
      letterSpacing: '-0.00833em',
      textAlign: 'left'
    },
    h4: {
      fontWeight: 550,
      fontSize: '1.1rem',
      lineHeight: '1.1',
      letterSpacing: '-0.00833em',
      textAlign: 'left'
    },
    body1: {
      fontWeight: 400,
      fontSize: '1rem',
      lineHeight: '1.5',
      letterSpacing: '0.00938em',
      textAlign: 'left'
    },
    body2: {
      fontWeight: 400,
      fontSize: '0.7rem',
      lineHeight: '1.5',
      letterSpacing: '0.01071em',
      textAlign: 'left'
    },
    // Custom (non-standard) typography entry; not referenced by the components in this file.
    topMenuBoxes: {
      flex: 1,
      textAlign: "center"
    }
  },
  components: {
    MuiButton: {
      defaultProps: {
        disableElevation: true,
      },
      styleOverrides: {
        root: {
          borderRadius: '5px',
        },
      },
    },
    MuiTextField: {
      styleOverrides: {
        root: {
          borderRadius: '5px',
        },
      },
    },
    MuiListItem: {
      styleOverrides: {
        root: {
          // Separator between list items, dropped on the last one.
          borderBottom: '2px solid #ccc',
          '&:last-of-type': {
            borderBottom: 'none',
          },
        },
      },
    },
  },
});

// responsiveFontSizes() returns a NEW theme and leaves its argument untouched,
// so the result must be captured; calling it and discarding the return value
// has no effect on the theme handed to ThemeProvider.
const theme = responsiveFontSizes(baseTheme);


function App() {
  return (
    <Router>
        <ThemeProvider theme={theme}>
          <AppBar position="static">
            <Toolbar sx={{textAlign: "center", justifyContent: "center"}}>
              <Button color="inherit" component={Link} to="/">
                Blog
              </Button>
              <Button color="inherit" component={Link} to="https://github.com/nostrebor/hiring-hn-search">
                Github
              </Button>
            </Toolbar>
          </AppBar>
          <Container maxWidth="md">
            <Routes>
            <Route exact path="/" element = {
                  <Blog/>
                }/>
            </Routes>
            <Routes exact path="/index.html" element = { <Blog/> }/>
          </Container>
        </ThemeProvider>
      </Router>
  );
}

function Blog() {
  const runHandler = `def poll_for_results(client, run, listings):
  while run.status in ['queued', 'in_progress']:
    time.sleep(0.05)
    run = client.beta.threads.runs.retrieve(
      thread_id=run.thread_id,
      run_id=run.id
    )
  if run.status == "requires_action":
    ## handle tool calls here
  elif run.status == "failed":
    ## handle failures
  else:
    ## we have our result!
    message_list = client.beta.threads.messages.list(
      thread_id=run.thread_id,
      order="asc"
    )
  `
  const templateString = `template = """
  Given the following extracted parts of a hacker news story ("SOURCES") and a question ("QUESTION"), create a final answer.
  Don't try to make up an answer and use the text in the SOURCES only for the answer. If you don't know the answer, just say that you don't know.
  Provide your answer in the following json format. There must be one source per entry and you cannot use 'all sources'.
  {{"docs": [ 
      {{
          "source": source1
    }}
  ]}}
  =========
  QUESTION: {question}
  =========
  SOURCES:
  {summaries}
  =========
  ANSWER:
  """
  `
  return (
    <Container maxWidth="lg">
      <Typography variant="h2" gutterBottom>
        Who's Hiring Search
      </Typography>
      <Typography variant="h3" gutterBottom> 
        Querying monthly Hacker News job postings using OpenAI Assistants
      </Typography>
      <Typography variant="body1" gutterBottom>
        <p>
          I met someone at a potluck who will be finishing her masters degree in the next few months. She was having trouble finding jobs. I asked some of the usual questions: "What do you enjoy doing? What kind of roles are you targeting? What was your job experience before school?" I made some on the spot recommendations, but then went home and decided to look over Hacker News' Who's Hiring?
        </p>
        <p>
          What I love about Who's Hiring is that it's usually a list of new job postings that can get you a referral. But the second you get out of looking for engineering jobs, you have your first filter. Let's say we're looking for Dev Ops roles. Do I search for Dev Ops? What about Infrastructure Engineer? Cloud Engineer? It's quite easy to look for the wrong word and miss job postings. When you want to add another constraint, like a city or an industry, it just becomes impossible to do well from the browser.
        </p>
        <p>
          I wanted a way for someone to search Who's Hiring like they would talk to a friend. RAG, or Retrieval Augmented Generation, is an approach that lets you use external information in combination with Generative AI to search through your own data with natural language. While I'd done a similar proof of concept with a vector database, I wanted to use a number of recently released OpenAI features to see how they performed.
        </p>
        <p>
          There have been a ton of posts on RAG. Most are undifferentiated. This blog will look at the Assistants model and compare it against a Vector Database approach. The GitHub repo will let you spin up your own instance of the Assistant if that's more your speed.
        </p>
        <p>
          At Dev Day, OpenAI released the gpt-4–1106-preview model which introduced 'Assistants'. Assistants allow you to use built in storage, run tools like your custom functions or ChatGPT managed Code Interpreter (in parallel!), and persist conversations in individual threads. That means that we can create an Assistant to answer natural language requests about job listings that uses the 'Who's Hiring' thread as a data source and create a function that enforces we get the results back in our expected format. We'll compare this with traditional vector database enabled RAG.
        </p>
        <p>
          The finished product looks like the following:
        </p>
        <VideoPlayer src='AssistantsDemo720p.mov' />
      </Typography>
      <p></p>
      <Typography variant="h3">OpenAI Assistants Application</Typography>
      <Typography variant="body1">
        <p>
          There are three parts to our Assistants orchestrator. A Hacker News scraper which returns all of the comments for a particular HN story, the Assistant builder which takes those in a file format, and a function definition that defines the expected format of a response.
        </p>
        <img src='AssistantsBuildFlow.png' width='100%'></img>
        <p>
          An additional consideration is that OpenAI Assistants decides whether to index files using a Vector Database or to pass the entire file into the context window. This comes with large cost in terms of token consumption and needs to be worked around. From the documentation:
        </p>
      </Typography>
      <Typography variant="body1">
        <blockquote>
          <i>
            The model then decides when to retrieve content based on the user Messages. The Assistants API automatically chooses between two retrieval techniques: it either passes the file content in the prompt for short documents, or performs a vector search for longer documents. Retrieval currently optimizes for quality by adding all relevant content to the context of model calls. We plan to introduce other retrieval strategies to enable developers to choose a different tradeoff between retrieval quality and model usage cost.
          </i>
        </blockquote>
      </Typography>
      <Typography variant="body1">
        <p>
          The length of Who's Hiring, including JSON formatting breaking out comment IDs and the message text, is typically in the 400,000-500,000 range. Both of these converted to tokens (approximated by dividing by 4) are typically going to be less than the 128,000 token context window used by the GPT4 version that supports Assistants. When using this file-in-context mode, you can expect extremely high token consumption. By adding filler comments to get over the 128,000 token length, performance was improved.
        </p>
      </Typography>
      <p></p>
      <Typography variant="h3">Search Handling API</Typography>
      <Typography variant="body1">
        <p>
          After we build our Assistant, we need an API boundary that our web application can call out to to handle forwarding those to our ChatGPT Assistant. This lets us keep our OpenAI API Key on the API server. As a simple proof of concept, I created a Flask app, but in practice this is more likely to be an API Gateway that can handle authentication, authorization, and usage tracking for the requests.
        </p>
        <img src='APICallFlow.png' width='100%'></img>
        <p>
          The Search Handling API needs to work around the Assistants threading model. As part of a run against an Assistant, you are returned a thread which handles the run step-by-step. When we receive an event that isn’t just ChatGPT doing its thing, we need to build in handling. This is given to us by a status of ‘required_action’. Practically, when we receive a call to our JSON formatter, we’re responsible for submitting the result of the tool back to ChatGPT.
        </p>
        <SyntaxHighlighter children={runHandler} language="python" style={base16AteliersulphurpoolLight}></SyntaxHighlighter>
        <p>
          We then return the results of the call back to the web app in the structured format we’ve directed ChatGPT to generate.
        </p>
      </Typography>
      <Typography variant="h3">React Application</Typography>
      <Typography variant="body1">
        <p>Finally, we have a simple React App which handles visualizing the results.</p>
        <Typography variant="h4">
          Viewing Job Listings
        </Typography>
        <p>We also scrape the post here to render an initial page of job listings. In the React App, we map comment ids to their job listing content. We tell React to rerender the page when the listings we want to render change. </p>
        <Typography variant="h4">
          Searching Job Listings
        </Typography>
        <p>We have a search box that sends requests to our Marqo app. When we receive a response, we set the children to the list of sources returned by our endpoint which changes our view.</p>
      </Typography>
      <Typography variant="h3">Vector Database Approach</Typography>
      <Typography variant="body1"> 
        <p>
          The Vector Database Approach differs in a few ways. Rather than just write a flat-file back to OpenAI Assistants, we generate the encodings for each of the comments using a preferred model. The upfront indexing means that we have a performant vector database to work against and get back initial batches of results based on a natural language input.
        </p>
        <p>
          These initial batches of results are often quite good, but they judge the similarity based on the proximity (defined by common vector distance approaches) in the databases, and often results can be similar like a fork and a spoon are similar when you’re eating soup. ChatGPT can then be used as a sanity check against the most similar results returned by the VectorDB.
        </p>
      </Typography>
      <p></p>
      <VideoPlayer src="Demo.mov" />
      <p></p>
      <Typography variant="h3">Comparison</Typography>
      
      <Typography variant="body1">
        <p>The natural next question is when to use OpenAI Assistants or a Vector Database for RAG.</p>
        <img src="FeatureComparison.png" width="100%"/>
        <Typography variant="h4">
          <p>Integrated Managed Service</p>
        </Typography>
        <p>
          There's no other integrated end to end RAG and Generation solution that I'm aware of. Combining in the feature to chain functions together directly from the Assistants interface, and you have a powerful tool to handle a number of use cases. I see this being especially useful for prototyping use cases that will eventually target GPT4. Nothing is stopping you from seeing what a RAG enabled pipeline using tools like function chaining from langchain could look like in production. You can get this up and running directly from the playground, or build against the API to see how it would work in your application. This is a really comprehensive feature behind a simple consumption based API with scalability out of the box.
        </p>
        <Typography variant="h4">
          <p>Performance</p>
        </Typography>
        <p>
          While the API is very conceptually clean and getting the PoC up-and-running was quick, the performance of the API has been more than ten times as slow as using a combination of Vector Databases and GPT4. I attribute a lot of this to the Assistants runtime. You can see the steps that are taken, and often there a multiple requests to our function call rather than a single batch of listings returned. Sometimes the retrieval tool is used multiple times. Beyond that, the system has to figure out to use the retrieval tool to begin with. When using a Vector DB, we can direct the flow to always retrieve first and control the embedding we use to figure out the appropriate dimensionality for retrieval speed versus quality. I was able to get similar results with much lower dimensional embeddings.
        </p>
        <Typography variant="h4">
          <p>Content Updates</p>
        </Typography>
        <p>
          The OpenAI approach doesn't provide an efficient way to update the files you're retrieving against. This means if you have upfront knowledge that you want OpenAI to use, the Assistants retrieval tool works beautifully. If you have regular insertions, updates, or deletions to the data you want to be used by the LLM, you need to recreate a file and add the association back to the Assistant. When you do this, it will reindex the file contents which takes a good amount of time. Compared with the Vector DB approach, where these access patterns are first-class features, you can see there are a number of use cases that just aren't meant for Assistants.
        </p>
        <Typography variant="h4">
          <p>Embeddings</p>
        </Typography>
        <p>
          The largest difference from a design perspective is control over what you're embedding and how. Specific embeddings might be more useful in specific contexts depending on your performance requirements, quality requirements, input data, and the cost that you're willing to pay for the underlying compute infrastructure. The content of the embedding is also important. For Who's Hiring, there's metadata that we might want to include in each chunk of the data which we embed. Regardless of if we choose to chunk based on sentences, word count, or some other feature of our text, we'd want to include things like the location of the listing, whether it's on site, and the roles that are being hired for in each chunk. This lets us handle queries like `ROLE jobs in CITY doing FUNCTION` reliably. Otherwise the embedding that we match might not contain the full context we need for the query. With Assistants, we have no control over how it decides to store the embeddings in the managed vector database.
        </p>
        <Typography variant="h4">
          <p>Quality</p>
        </Typography>
        <p>
          The quality of results returned by Assistants are good. With the sample app, you may notice erroneous results -- this is often because of my direction in the prompt to return multiple results. When you're searching for queries that may have many semantically similar matches based on common words in the search phrase this matters. For instance, "Software engineering jobs using AWS" contains very few words that are likely to be unique in the search space. The better embeddings with the GPT4 beta gave much more relevant results. However, we can also calculate the embeddings for the vector DB using GPT4 embeddings. I haven't done this due to the token constraints and hitting daily usage limits when testing. I imagine that the quality difference here would narrow using a like-for-like embedding.
        </p>
        <Typography variant="h4">
          <p>Overview</p>
        </Typography>
        <p>
          I think we'll see a number of proof of concepts built out using assistants. The fact that you can validate what RAG and Functions can return is invaluable, especially given that integration into your app is a few dozen lines of code with an Assistant that has already been created. For apps where the files being retreived from are rarely or never modified, I think many of these will make it into production. For instance, things like FAQ bots, where the answers are unlikely to change, can likely be spun up by a semi-technical user and moved to prod. But for latency sensitive applications like the search that we're doing, I think Vector DBs will remain the preferred approach. It is a beta, and I'm sure there's more planned for Assistants. I'm looking forward to seeing what you can do with them in a year!
        </p>
      </Typography>
  </Container>
  )
}

function VideoPlayer(props) {
  return (
    <video 
      width="100%" 
      height="100%" 
      controls
      autoPlay
      loop
      muted
      playsInline
      style={{ objectFit: 'cover' }} 
    >
      <source src= { props.src } type="video/mp4"/>
    </video>
  );
}

export default App;
