code-talk/sql-code-talk

#!/usr/bin/env python

from langchain.document_loaders import UnstructuredFileLoader, WebBaseLoader, YoutubeLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.utils import filter_complex_metadata
import os
import json
import requests
import re
import sys
from urllib.parse import urlparse, parse_qs
import nltk
from tqdm import tqdm
from markdown_pdf import Section, MarkdownPdf


# The script needs exactly one positional argument: a file path or a URL.
# Fail with a usage message instead of an IndexError when it is missing.
if len(sys.argv) < 2:
    sys.exit(f"usage: {os.path.basename(sys.argv[0])} <file-or-url>")

# PDF document that the overview and per-chunk outline are written into.
pdf = MarkdownPdf(toc_level=2)


file_input = sys.argv[1]

# Derive the document title (also used as the output file name) from the
# input's base name with its extension removed.
filename, file_extension = os.path.splitext(file_input)
# splitext already dropped the trailing extension; the replace additionally
# removes any other occurrence of the same extension text in the base name.
title = os.path.basename(filename).replace(file_extension, '')


pdf.add_section(Section(f"# {title}\n", toc=True))


# Model name passed to generate_text() for every request.
model = "llama3:latest"
# model = "mistral:latest"

# Placeholders; only `docs` is assigned later in the visible script.
vector_store = None
retriever = None
chain = None
docs = None


def is_bulletpoint(s):
    """Report whether a digit occurs within the first five characters of *s*.

    The caller uses this as a heuristic for numbered-list lines such as
    ``"1. ..."``; any digit in the leading five characters counts.

    Args:
        s: The line of text to inspect.

    Returns:
        True if at least one of the first five characters is a digit,
        otherwise False (including for an empty string).
    """
    return any(ch.isdigit() for ch in s[:5])


def generate_text(model, prompt, system=""):
    """Request one non-streamed completion from the local generate endpoint.

    Posts to ``http://localhost:11434/api/generate`` with a fixed sampling
    temperature of 0.4 and ``stream`` disabled, so the whole answer arrives
    in a single JSON document.

    Args:
        model: Name of the model to run, e.g. ``"llama3:latest"``.
        prompt: The user prompt.
        system: Optional system prompt; defaults to an empty string.

    Returns:
        The value of the ``"response"`` field of the JSON reply.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection-level failures.
        KeyError: If a successful reply has no ``"response"`` field.
    """
    url = "http://localhost:11434/api/generate"
    data = {
        "model": model,
        "prompt": prompt,
        "system": system,
        "stream": False,
        "options": {
            "temperature": 0.4,
        },
    }
    # No timeout is set on purpose: a generation may legitimately take long.
    response = requests.post(url, json=data)
    # Surface HTTP failures here rather than as a confusing KeyError below.
    response.raise_for_status()
    return response.json()["response"]


def isyoutubevideo(youtube_url):
    """Return True when *youtube_url* points at a YouTube video.

    Two URL shapes are recognised:

    * any ``youtu.be`` short link, and
    * a ``youtube.com`` URL (including subdomains such as ``www.`` or
      ``m.``) that carries a ``v`` query parameter.

    The host is checked explicitly so that unrelated sites which happen to
    use a ``v`` query parameter are not mistaken for YouTube.

    Args:
        youtube_url: The URL string to classify.

    Returns:
        True for a recognised YouTube video URL, otherwise False.
    """
    parsed_url = urlparse(youtube_url)
    # hostname is None for strings without a network location.
    host = (parsed_url.hostname or "").lower()

    if host == "youtu.be" or host.endswith(".youtu.be"):
        return True
    if host == "youtube.com" or host.endswith(".youtube.com"):
        return "v" in parse_qs(parsed_url.query)
    return False


def is_url(string):
    """Tell whether *string* begins with an ``http://`` or ``https://`` scheme.

    Only the prefix is examined; the rest of the string is not validated.

    Args:
        string: The text to test.

    Returns:
        True if the text starts with ``http://`` or ``https://``, else False.
    """
    return re.match(r"^https?://", string) is not None


# text_splitter = RecursiveCharacterTextSplitter(chunk_size=2048, chunk_overlap=100)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=4096, chunk_overlap=100)

# Pick a loader: YouTube transcript, generic web page, or local file.
if is_url(file_input):

    if isyoutubevideo(file_input):
        print("Loading youtube video...")
        # Take the id from the ?v= parameter when present; otherwise use the
        # last path segment (short links). Slicing the last 11 characters of
        # the raw URL is kept only as a final fallback, because it breaks as
        # soon as the URL carries trailing parameters such as "&t=30s".
        parsed_input = urlparse(file_input)
        video_ids = parse_qs(parsed_input.query).get("v")
        if video_ids:
            video_id = video_ids[0]
        else:
            last_segment = parsed_input.path.rstrip("/").rsplit("/", 1)[-1]
            video_id = last_segment or file_input[-11:]
        docs = YoutubeLoader(video_id).load()

    else:
        print("Loading url...")
        # Extract and load webpage text
        docs = WebBaseLoader(file_input).load()

else:
    # Try the unstructured loader first and fall back to a plain text read.
    # Catch Exception rather than everything so Ctrl-C and SystemExit still
    # propagate.
    try:
        docs = UnstructuredFileLoader(file_input).load()
    except Exception:
        docs = TextLoader(file_input).load()

# Common post-processing for every input kind: split into chunks, then drop
# metadata entries via filter_complex_metadata.
docs = text_splitter.split_documents(docs)
docs = filter_complex_metadata(docs)


outline = ""
pre_summery = ""


print("\nNumber of Chunks: ", len(docs))


# Token count of the complete input, used later for the compression ratio.
t = "".join(a.page_content for a in docs)

nltk_tokens_init = nltk.word_tokenize(t)
print("Number of Tokens: " + str(len(nltk_tokens_init)) + "\n")


# The per-chunk system prompt never changes, so it is built once up front.
system_prompt = """
You are a professional code summarizer. You will be be given a SQL query in chunk section.
Take each chunk and create a very short concise single paragraph summery. The chunk will be under the # CHUNK heading. Only output the summery.
Do not under any circumstance output the # CHUNK section, SQL code, or bullet points.
"""

bar = tqdm(desc="Loading…", ascii=False, ncols=100, total=len(docs))
count = 0
for count, x in enumerate(docs, start=1):
    chunk_text = str(x.page_content)

    prompt = f"""
Write a paragraph summary of the following CHUNK of sql code.

# CHUNK
{chunk_text}
"""

    outline = generate_text(model, prompt, system_prompt)

    # Despite the instructions the model sometimes answers with a numbered
    # list. Keep such lists readable by indenting continuation lines;
    # otherwise flatten the answer into a single paragraph.
    outline_lines = outline.split("\n")
    if is_bulletpoint(outline_lines[0]):
        outline = " The SQL script performs the following tasks:\n" + outline.replace("\n", "\n\t")
    elif any(is_bulletpoint(line) for line in outline_lines):
        outline = outline.replace("\n", "\n\t")
    else:
        outline = outline.replace("\n", " ")

    pre_summery += "\n\n" + str(count) + "." + outline

    # Advance the bar only once the chunk has actually been processed.
    bar.update()


bar.close()

nltk_tokens = nltk.word_tokenize(pre_summery)
print("\nNumber of Tokens: ", len(nltk_tokens))
# An empty summary (e.g. no chunks) would otherwise divide by zero.
if nltk_tokens:
    print("Compression Ratio: ", round(len(nltk_tokens_init)/len(nltk_tokens), 1))
else:
    print("Compression Ratio:  n/a")


# Final Summary: condense the numbered per-chunk summaries into one overview.
system_prompt = "You are an expert summarizer. Your Job it to take all the individual sections under each bullet point. Make sure that the summary is long and detailed. Do not mention anything about sections or chunks and only summarize in paragraph form. Never let your summary's contain outlines or built points"

prompt = f"""
Here are a bunch of bulit points. Please summerize them:

{pre_summery}
"""

final_summery = generate_text(model, prompt, system_prompt)


print("Done")

pdf.add_section(Section(f"## Basic Overview\n{final_summery}\n\n"))
pdf.add_section(Section(f"## Code Outline\n{pre_summery}\n\n"))

pdf.meta["title"] = title
pdf.meta["author"] = "locker98"
# The PDF is written to the current working directory, named after the title.
pdf.save(f"{title}.pdf")

# print(f"\n\n\n{pre_summery}")
first commit 2024-04-19 02:37:11 +00:00			`#!/usr/bin/env python`

			`from langchain.document_loaders import UnstructuredFileLoader, WebBaseLoader, YoutubeLoader, TextLoader`
			`from langchain.text_splitter import RecursiveCharacterTextSplitter`
			`from langchain.vectorstores.utils import filter_complex_metadata`
			`import os`
			`import json`
			`import requests`
			`import re`
			`import sys`
			`from urllib.parse import urlparse, parse_qs`
			`import nltk`
			`from tqdm import tqdm`
			`from markdown_pdf import Section, MarkdownPdf`


			`pdf = MarkdownPdf(toc_level=2)`


			`file_input = sys.argv[1]`

			`filename, file_extension = os.path.splitext(file_input)`
			`title = os.path.basename(filename).replace(file_extension, '')`


			`pdf.add_section(Section(f"# {title}\n", toc=True))`


orginized 2024-04-19 03:02:43 +00:00			`model = "llama3:latest"`
			`# model = "mistral:latest"`
first commit 2024-04-19 02:37:11 +00:00
			`vector_store = None`
			`retriever = None`
			`chain = None`
			`docs = None`

orginized 2024-04-19 03:02:43 +00:00
first commit 2024-04-19 02:37:11 +00:00			`def is_bulletpoint(s):`
			`for char in s[:5]:`
			`if char.isdigit():`
			`return True`
			`return False`

orginized 2024-04-19 03:02:43 +00:00
			`def generate_text(model, prompt, system=""):`
first commit 2024-04-19 02:37:11 +00:00			`url = "http://localhost:11434/api/generate"`
			`data = {`
			`"model": model,`
			`"prompt": prompt,`
			`"system": system,`
			`"stream": False,`
			`"options": {`
			`"temperature": 0.4,`
			`}`
			`}`
			`response = requests.post(url, json=data)`
			`text = json.loads(response.text)`
			`return text["response"]`


			`def isyoutubevideo(youtube_url):`
			`parsed_url = urlparse(youtube_url)`
			`query_params = parse_qs(parsed_url.query)`
			`if 'v' in query_params:`
			`return True`
			`elif "youtu.be" in parsed_url:`
			`return True`
			`else:`
			`return False`


			`def is_url(string):`
			`pattern = r"^https?://"`
			`return bool(re.search(pattern, string))`



orginized 2024-04-19 03:02:43 +00:00			`# text_splitter = RecursiveCharacterTextSplitter(chunk_size=2048, chunk_overlap=100)`
first commit 2024-04-19 02:37:11 +00:00			`text_splitter = RecursiveCharacterTextSplitter(chunk_size=4096, chunk_overlap=100)`

			`# Checking if url or if file path`
			`if is_url(file_input):`

			`# See if youtube link`
orginized 2024-04-19 03:02:43 +00:00			`if isyoutubevideo(file_input):`
first commit 2024-04-19 02:37:11 +00:00			`print("Loading youtube video...")`
			`# Prepare youtube url for transcript extraction`
			`#parsed_url = urlparse(file_input)`
			`#query_params = parse_qs(parsed_url.query)`
			`# Get youtube video id`
			`#video_id = query_params['v'][0]`
			`# Load for emmbeddings`
			`video_id = file_input[-11:]`
			`docs = YoutubeLoader(video_id).load()`

			`else:`
			`print("Loading url...")`
			`# Extract and load webpage text`
			`docs = WebBaseLoader(file_input).load()`

			`# Prepare text`
			`docs = text_splitter.split_documents(docs)`
			`docs = filter_complex_metadata(docs)`

			`else:`
			`# Load File`
			`try:`
			`docs = UnstructuredFileLoader(file_input).load()`
			`except:`
			`docs = TextLoader(file_input).load()`
			`# Prepare file`
			`docs = text_splitter.split_documents(docs)`
			`docs = filter_complex_metadata(docs)`


			`outline = ""`
			`pre_summery = ""`


			`print("\nNumber of Chunks: ", len(docs))`


			`t = ""`

			`for a in docs:`
			`t += a.page_content`

			`nltk_tokens_init = nltk.word_tokenize(t)`
			`print("Number of Tokens: " + str(len(nltk_tokens_init)) + "\n")`


			`bar = tqdm(desc="Loading…", ascii=False, ncols=100, total=len(docs))`
			`count = 0`
			`for x in docs:`
			`count += 1`
			`bar.update()`
			`context = str(x.page_content)`

			`chunk_text = context`

			`system_prompt = """`
			`You are a professional code summarizer. You will be be given a SQL query in chunk section.`
			`Take each chunk and create a very short concise single paragraph summery. The chunk will be under the # CHUNK heading. Only output the summery.`
			`Do not under any circumstance output the # CHUNK section, SQL code, or bullet points.`
			`"""`
			`prompt = f"""`
			`Write a paragraph summary of the following CHUNK of sql code.`

			`# CHUNK`
			`{chunk_text}`
			`"""`
orginized 2024-04-19 03:02:43 +00:00
first commit 2024-04-19 02:37:11 +00:00			`outline = generate_text(model, prompt, system_prompt)`
orginized 2024-04-19 03:02:43 +00:00
first commit 2024-04-19 02:37:11 +00:00			`bullet_point = False`
orginized 2024-04-19 03:02:43 +00:00
first commit 2024-04-19 02:37:11 +00:00			`for x in outline.split("\n"):`
			`if is_bulletpoint(x):`
			`bullet_point = True`

			`if is_bulletpoint(outline.split("\n")[0]):`
			`outline = " The SQL script performs the following tasks:\n" + outline.replace("\n", "\n\t")`
			`elif bullet_point:`
			`outline = outline.replace("\n", "\n\t")`
			`else:`
			`outline = outline.replace("\n", " ")`

			`pre_summery += "\n\n" + str(count) + "." + outline`

orginized 2024-04-19 03:02:43 +00:00			`# print("\n\n--------------------------------------------------------------------------------------------")`
			`# print(outline)`
first commit 2024-04-19 02:37:11 +00:00

			`bar.close()`

			`nltk_tokens = nltk.word_tokenize(pre_summery)`
			`print("\nNumber of Tokens: ", len(nltk_tokens))`
orginized 2024-04-19 03:02:43 +00:00			`print("Compression Ratio: ", round(len(nltk_tokens_init)/len(nltk_tokens), 1))`
first commit 2024-04-19 02:37:11 +00:00

			`# Final Summary`
			`system_prompt = "You are an expert summarizer. Your Job it to take all the individual sections under each bullet point. Make sure that the summary is long and detailed. Do not mention anything about sections or chunks and only summarize in paragraph form. Never let your summary's contain outlines or built points"`

			`prompt = f"""`
			`Here are a bunch of bulit points. Please summerize them:`

			`{pre_summery}`
			`"""`

			`final_summery = generate_text(model, prompt, system_prompt)`


			`print("Done")`

			`pdf.add_section(Section(f"## Basic Overview\n{final_summery}\n\n"))`
			`pdf.add_section(Section(f"## Code Outline\n{pre_summery}\n\n"))`

			`pdf.meta["title"] = title`
			`pdf.meta["author"] = "locker98"`
			`pdf.save(f"{title}.pdf")`

orginized 2024-04-19 03:02:43 +00:00			`# print(f"\n\n\n{pre_summery}")`
first commit 2024-04-19 02:37:11 +00:00