organized

This commit is contained in:
test 2024-04-18 23:02:43 -04:00
parent 786f3934f9
commit 565ee99aac
2 changed files with 55 additions and 83 deletions

View File

@@ -1,25 +1,14 @@
 #!/usr/bin/env python
-#from langchain.embeddings import FastEmbedEmbeddings
-#from langchain.schema.output_parser import StrOutputParser
 from langchain.document_loaders import UnstructuredFileLoader, WebBaseLoader, YoutubeLoader, TextLoader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
-#from langchain.schema.runnable import RunnablePassthrough
-#from langchain.prompts import PromptTemplate
-#from langchain.schema.document import Document
 from langchain.vectorstores.utils import filter_complex_metadata
-#from langchain_community.embeddings import OllamaEmbeddings
-#import mimetypes
 import os
 import json
 import requests
-#from pathlib import Path
-from rich.markdown import Markdown
-from rich.console import Console
 import re
 import sys
 from urllib.parse import urlparse, parse_qs
-#from youtube_transcript_api import YouTubeTranscriptApi
 import nltk
 from tqdm import tqdm
 from markdown_pdf import Section, MarkdownPdf
@@ -37,21 +26,23 @@ title = os.path.basename(filename).replace(file_extension, '')
 pdf.add_section(Section(f"# {title}\n", toc=True))
-model = "dolphin-mistral:latest"
-#model = "mistral:latest"
+model = "llama3:latest"
+# model = "mistral:latest"
 vector_store = None
 retriever = None
 chain = None
 docs = None

 def is_bulletpoint(s):
     for char in s[:5]:
         if char.isdigit():
             return True
     return False

-def generate_text(model, prompt, system = ""):
+def generate_text(model, prompt, system=""):
     url = "http://localhost:11434/api/generate"
     data = {
         "model": model,
@@ -84,15 +75,14 @@ def is_url(string):
-#text_splitter = RecursiveCharacterTextSplitter(chunk_size=2048, chunk_overlap=100)
+# text_splitter = RecursiveCharacterTextSplitter(chunk_size=2048, chunk_overlap=100)
 text_splitter = RecursiveCharacterTextSplitter(chunk_size=4096, chunk_overlap=100)

 # Checking if url or if file path
 if is_url(file_input):
     # See if youtube link
-    if isyoutubevideo(file_input) == True:
+    if isyoutubevideo(file_input):
         print("Loading youtube video...")
         # Prepare youtube url for transcript extraction
         #parsed_url = urlparse(file_input)
@@ -108,7 +98,6 @@ if is_url(file_input):
         # Extract and load webpage text
         docs = WebBaseLoader(file_input).load()
         # Prepare text
         docs = text_splitter.split_documents(docs)
         docs = filter_complex_metadata(docs)
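The webpage branch shown here is a standard LangChain load-split-filter pipeline: fetch the page, cut it into 4096-character chunks with 100 characters of overlap, and strip metadata values the downstream tooling cannot store. A minimal self-contained sketch (the URL is a hypothetical placeholder):

```python
from langchain.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.utils import filter_complex_metadata

url = "https://example.com/article"  # placeholder, not from the repo

splitter = RecursiveCharacterTextSplitter(chunk_size=4096, chunk_overlap=100)
docs = WebBaseLoader(url).load()       # one Document per fetched page
docs = splitter.split_documents(docs)  # ~4096-char chunks, 100-char overlap
docs = filter_complex_metadata(docs)   # keep only simple metadata types
```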
@@ -124,10 +113,6 @@ else:
     docs = filter_complex_metadata(docs)

 outline = ""
 pre_summery = ""
@@ -180,21 +165,17 @@ Write a paragraph summary of the following CHUNK of sql code.
     else:
         outline = outline.replace("\n", " ")
     pre_summery += "\n\n" + str(count) + "." + outline
-    #print("\n\n--------------------------------------------------------------------------------------------")
-    #print(outline)
+    # print("\n\n--------------------------------------------------------------------------------------------")
+    # print(outline)
 bar.close()

 nltk_tokens = nltk.word_tokenize(pre_summery)
 print("\nNumber of Tokens: ", len(nltk_tokens))
-print("Compression Ratio: ", round(len(nltk_tokens_init)/len(nltk_tokens),1))
+print("Compression Ratio: ", round(len(nltk_tokens_init)/len(nltk_tokens), 1))

 # Final Summary
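The compression ratio printed here is simply the token count of the original text divided by the token count of the generated outline, both measured with `nltk.word_tokenize`. A worked sketch with stand-in strings (not the script's real inputs):

```python
import nltk
# nltk.download("punkt")  # one-time tokenizer model download, if not present

original = "SELECT id, name FROM users WHERE active = 1;" * 50  # stand-in source text
summary = "Fetches the id and name of every active user."       # stand-in summary

nltk_tokens_init = nltk.word_tokenize(original)
nltk_tokens = nltk.word_tokenize(summary)

# e.g. 600 source tokens vs 10 summary tokens -> ratio 60.0
print("Compression Ratio:", round(len(nltk_tokens_init) / len(nltk_tokens), 1))
```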
@@ -218,7 +199,5 @@ pdf.meta["title"] = title
 pdf.meta["author"] = "locker98"
 pdf.save(f"{title}.pdf")

-#print(f"\n\n\n{pre_summery}")
+# print(f"\n\n\n{pre_summery}")

View File

@@ -1,25 +1,14 @@
 #!/usr/bin/env python
-#from langchain.embeddings import FastEmbedEmbeddings
-#from langchain.schema.output_parser import StrOutputParser
 from langchain.document_loaders import UnstructuredFileLoader, WebBaseLoader, YoutubeLoader, TextLoader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
-#from langchain.schema.runnable import RunnablePassthrough
-#from langchain.prompts import PromptTemplate
-#from langchain.schema.document import Document
 from langchain.vectorstores.utils import filter_complex_metadata
-#from langchain_community.embeddings import OllamaEmbeddings
-#import mimetypes
 import os
 import json
 import requests
-#from pathlib import Path
-from rich.markdown import Markdown
-from rich.console import Console
 import re
 import sys
 from urllib.parse import urlparse, parse_qs
-#from youtube_transcript_api import YouTubeTranscriptApi
 import nltk
 from tqdm import tqdm
 from markdown_pdf import Section, MarkdownPdf
@@ -37,15 +26,23 @@ title = os.path.basename(filename).replace(file_extension, '')
 pdf.add_section(Section(f"# {title}\n", toc=True))
-model = "dolphin-mistral:latest"
-#model = "mistral:latest"
+model = "llama3:latest"
+# model = "mistral:latest"
 vector_store = None
 retriever = None
 chain = None
 docs = None

-def generate_text(model, prompt, system = ""):
+def is_bulletpoint(s):
+    for char in s[:5]:
+        if char.isdigit():
+            return True
+    return False
+
+def generate_text(model, prompt, system=""):
     url = "http://localhost:11434/api/generate"
     data = {
         "model": model,
@@ -53,7 +50,7 @@ def generate_text(model, prompt, system = ""):
         "system": system,
         "stream": False,
         "options": {
-            "temperature": 0.6,
+            "temperature": 0.4,
         }
     }
     response = requests.post(url, json=data)
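Pulling these hunks together, `generate_text` POSTs to Ollama's local `/api/generate` endpoint with streaming disabled and the temperature this commit lowers to 0.4. The `"prompt"` field and the response parsing are not visible in the diff, so the sketch below fills them in from the standard Ollama API shape:

```python
import requests

def generate_text(model, prompt, system=""):
    # Ollama's local generation endpoint (default port 11434).
    url = "http://localhost:11434/api/generate"
    data = {
        "model": model,
        "prompt": prompt,         # assumed field; not shown in the hunk
        "system": system,
        "stream": False,          # one JSON object instead of a token stream
        "options": {
            "temperature": 0.4,   # lowered from 0.6 in this commit
        },
    }
    response = requests.post(url, json=data)
    # With stream=False, Ollama returns the whole completion in "response".
    return response.json()["response"]
```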
@@ -78,15 +75,14 @@ def is_url(string):
-#text_splitter = RecursiveCharacterTextSplitter(chunk_size=2048, chunk_overlap=100)
+# text_splitter = RecursiveCharacterTextSplitter(chunk_size=2048, chunk_overlap=100)
 text_splitter = RecursiveCharacterTextSplitter(chunk_size=4096, chunk_overlap=100)

 # Checking if url or if file path
 if is_url(file_input):
     # See if youtube link
-    if isyoutubevideo(file_input) == True:
+    if isyoutubevideo(file_input):
         print("Loading youtube video...")
         # Prepare youtube url for transcript extraction
         #parsed_url = urlparse(file_input)
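The body of `isyoutubevideo` is not shown anywhere in this diff. Given the script's `urlparse`/`parse_qs` imports, a plausible reconstruction, clearly hypothetical, would look like this:

```python
from urllib.parse import urlparse, parse_qs

# Hypothetical reconstruction; the real isyoutubevideo is not in this diff.
def isyoutubevideo(url):
    host = urlparse(url).netloc.lower()
    return host.endswith("youtube.com") or host.endswith("youtu.be")

# Illustrative video-id extraction for the two common URL shapes:
def get_video_id(url):
    parsed = urlparse(url)
    if parsed.netloc.endswith("youtu.be"):
        return parsed.path.lstrip("/")                      # https://youtu.be/<id>
    return parse_qs(parsed.query).get("v", [None])[0]       # ...watch?v=<id>
```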
@@ -102,7 +98,6 @@ if is_url(file_input):
         # Extract and load webpage text
         docs = WebBaseLoader(file_input).load()
         # Prepare text
         docs = text_splitter.split_documents(docs)
         docs = filter_complex_metadata(docs)
@@ -118,10 +113,6 @@ else:
     docs = filter_complex_metadata(docs)

 outline = ""
 pre_summery = ""
@@ -134,8 +125,8 @@ t = ""
 for a in docs:
     t += a.page_content
-nltk_tokens = nltk.word_tokenize(t)
-print("Number of Tokens: " + str(len(nltk_tokens)) + "\n")
+nltk_tokens_init = nltk.word_tokenize(t)
+print("Number of Tokens: " + str(len(nltk_tokens_init)) + "\n")

 bar = tqdm(desc="Loading…", ascii=False, ncols=100, total=len(docs))
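The rename to `nltk_tokens_init` preserves the pre-summarization token count so the compression ratio can be computed later, and the tqdm bar is sized to one tick per chunk. A skeleton of the per-chunk loop implied by the surrounding hunks; the counter handling via `enumerate` is an assumption, since the diff only shows `for x in docs:`:

```python
from tqdm import tqdm

# docs, model, and generate_text are defined earlier in the script.
bar = tqdm(desc="Loading…", ascii=False, ncols=100, total=len(docs))
pre_summery = ""
for count, x in enumerate(docs, start=1):  # counter source assumed
    chunk_text = x.page_content
    # ... build prompt, call generate_text(model, prompt, system_prompt) ...
    bar.update(1)
bar.close()
```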
@@ -149,44 +140,45 @@ for x in docs:
     system_prompt = """
     You are a professional code summarizer. You will be be given a SQL query in chunk section.
-    Take each chunk and create a very short concise summery. The chunk will be under the # CHUNK heading. Only output the summery.
-    Do not under any circumstance output the # CHUNK section or any SQL code.
+    Take each chunk and create a very short concise single paragraph summery. The chunk will be under the # CHUNK heading. Only output the summery.
+    Do not under any circumstance output the # CHUNK section, SQL code, or bullet points.
     """

     prompt = f"""
-Write a paragraph summary of the following CHUNK of sql code.
 # CHUNK
 {chunk_text}
 """
-## SUMMERY
-#{outline}
-#"""

     outline = generate_text(model, prompt, system_prompt)
-    outline = outline.replace("\n", " ")
-    pre_summery += "\n\n" + str(count) + ". " + outline
-    #print("\n\n\n")
-    #print(outline)
-    #print("\n\n\n")
+    bullet_point = False
+    for x in outline.split("\n"):
+        if is_bulletpoint(x):
+            bullet_point = True
+
+    if is_bulletpoint(outline.split("\n")[0]):
+        outline = " The SQL script performs the following tasks:\n" + outline.replace("\n", "\n\t")
+    elif bullet_point:
+        outline = outline.replace("\n", "\n\t")
+    else:
+        outline = outline.replace("\n", " ")
+    pre_summery += "\n\n" + str(count) + "." + outline
+    # print("\n\n--------------------------------------------------------------------------------------------")
+    # print(outline)

-#print("\n\n\n-----------------------------------------------------------------------------------\n\n\n")
-#print(pre_summery)
 bar.close()

 nltk_tokens = nltk.word_tokenize(pre_summery)
 print("\nNumber of Tokens: ", len(nltk_tokens))
+print("Compression Ratio: ", round(len(nltk_tokens_init)/len(nltk_tokens), 1))

-#print(pre_summery)
+# Final Summary

 system_prompt = "You are an expert summarizer. Your Job it to take all the individual sections under each bullet point. Make sure that the summary is long and detailed. Do not mention anything about sections or chunks and only summarize in paragraph form. Never let your summary's contain outlines or built points"
 prompt = f"""
@@ -198,8 +190,6 @@ Here are a bunch of bulit points. Please summerize them:
 final_summery = generate_text(model, prompt, system_prompt)
-#print("\n\n\n------------------------------Final Summery-----------------------------------------------\n\n\n")
-#print(final_summery)
 print("Done")

 pdf.add_section(Section(f"## Basic Overview\n{final_summery}\n\n"))
@@ -208,3 +198,6 @@ pdf.add_section(Section(f"## Code Outline\n{pre_summery}\n\n"))
 pdf.meta["title"] = title
 pdf.meta["author"] = "locker98"
 pdf.save(f"{title}.pdf")
+
+# print(f"\n\n\n{pre_summery}")
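For reference, the PDF assembly seen across these hunks uses markdown_pdf's `Section` objects plus a metadata dict. A minimal end-to-end sketch, assuming illustrative section text; the `toc_level` argument is an assumption, since the constructor call is not shown in the diff:

```python
from markdown_pdf import MarkdownPdf, Section

pdf = MarkdownPdf(toc_level=2)  # constructor args assumed, not shown in diff
pdf.add_section(Section("# Example Title\n", toc=True))
pdf.add_section(Section("## Basic Overview\nIllustrative summary text.\n\n"))
pdf.meta["title"] = "Example Title"
pdf.meta["author"] = "locker98"
pdf.save("Example Title.pdf")  # writes the PDF next to the script
```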