204 lines
5.1 KiB
Python
Executable File
204 lines
5.1 KiB
Python
Executable File
#!/usr/bin/env python
|
|
|
|
from langchain.document_loaders import UnstructuredFileLoader, WebBaseLoader, YoutubeLoader, TextLoader
|
|
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
from langchain.vectorstores.utils import filter_complex_metadata
|
|
import os
|
|
import json
|
|
import requests
|
|
import re
|
|
import sys
|
|
from urllib.parse import urlparse, parse_qs
|
|
import nltk
|
|
from tqdm import tqdm
|
|
from markdown_pdf import Section, MarkdownPdf
|
|
|
|
|
|
# --- Script setup: output PDF, input source, and model selection -------------

# PDF builder; headings up to level 2 appear in the table of contents.
pdf = MarkdownPdf(toc_level=2)

# The input (file path or URL) is the first CLI argument; fail with a usage
# message instead of an IndexError when it is missing.
if len(sys.argv) < 2:
    sys.exit(f"usage: {os.path.basename(sys.argv[0])} <file-or-url>")
file_input = sys.argv[1]

# Derive the document title from the input's base name. os.path.splitext
# already strips the extension, so no further cleanup of `filename` is needed
# (the original re-applied .replace(file_extension, ''), which was a no-op on
# the extension and could corrupt names containing that substring).
filename, file_extension = os.path.splitext(file_input)
title = os.path.basename(filename)

# Title page / top-level TOC entry.
pdf.add_section(Section(f"# {title}\n", toc=True))

# Ollama model tag used for every generation call.
model = "llama3:latest"
# model = "mistral:latest"

# Pipeline placeholders; only `docs` is actually populated below.
vector_store = None
retriever = None
chain = None
docs = None
|
|
|
|
|
def is_bulletpoint(s):
    """Return True if a digit occurs in the first five characters of *s*.

    Used to detect numbered-list lines (e.g. "1. ...") in model output.
    Note it matches any early digit, not only well-formed list markers.
    """
    # any() over a generator replaces the original manual loop-and-flag.
    return any(char.isdigit() for char in s[:5])
|
|
|
|
|
|
def generate_text(model, prompt, system=""):
    """Call a local Ollama /api/generate endpoint and return the reply text.

    Parameters:
        model:  Ollama model tag (e.g. "llama3:latest").
        prompt: user prompt sent to the model.
        system: optional system prompt steering the model.

    Returns the "response" field of the JSON reply.
    Raises requests.HTTPError on a non-2xx status and
    requests.Timeout if the server does not answer in time.
    """
    # NOTE(review): host looks truncated ("100.") — confirm the Ollama address.
    url = "http://100.:11434/api/api/generate" if False else "http://100.:11434/api/generate"
    data = {
        "model": model,
        "prompt": prompt,
        "system": system,
        "stream": False,  # single JSON document instead of a token stream
        "options": {
            "temperature": 0.4,  # fairly deterministic summaries
        },
    }
    # Generation of a large chunk can be slow, but never hang forever on a
    # dead server; also surface HTTP errors instead of a confusing KeyError.
    response = requests.post(url, json=data, timeout=600)
    response.raise_for_status()
    return response.json()["response"]
|
|
|
|
|
|
def isyoutubevideo(youtube_url):
    """Return True if *youtube_url* looks like a YouTube video link.

    Matches either a standard watch URL carrying a ?v=<id> query parameter
    or a youtu.be short link.
    """
    parsed_url = urlparse(youtube_url)
    query_params = parse_qs(parsed_url.query)
    if 'v' in query_params:
        return True
    # Bug fix: the original tested `"youtu.be" in parsed_url`, i.e. tuple
    # membership on the ParseResult, which only matched when an entire URL
    # component equalled "youtu.be". Test the hostname string instead so
    # e.g. "www.youtu.be" links are also recognized.
    return "youtu.be" in parsed_url.netloc
|
|
|
|
|
|
def is_url(string):
    """Return True when *string* starts with an http:// or https:// scheme."""
    # re.match anchors at the start, so the explicit ^ and re.search of the
    # original are equivalent to this single call.
    return re.match(r"https?://", string) is not None
|
|
|
|
|
|
|
|
# --- Document loading --------------------------------------------------------

# Split documents into ~4096-character chunks with a small overlap so each
# chunk can be summarized independently without cutting sentences in half.
# text_splitter = RecursiveCharacterTextSplitter(chunk_size=2048, chunk_overlap=100)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=4096, chunk_overlap=100)

# Checking if url or if file path
if is_url(file_input):

    # See if youtube link
    if isyoutubevideo(file_input):
        print("Loading youtube video...")
        # Assumes the 11-character video id is the final segment of the URL
        # (true for watch?v=<id> and youtu.be/<id> links without trailing
        # query parameters) — TODO confirm for playlist/timestamped links.
        video_id = file_input[-11:]
        # Load transcript for summarization
        docs = YoutubeLoader(video_id).load()

    else:
        print("Loading url...")
        # Extract and load webpage text
        docs = WebBaseLoader(file_input).load()

    # Prepare text: chunk it and drop metadata values loaders sometimes
    # attach that downstream stores cannot serialize.
    docs = text_splitter.split_documents(docs)
    docs = filter_complex_metadata(docs)

else:
    # Load File: try the rich unstructured parser first, fall back to a
    # plain-text read if parsing fails. `except Exception` replaces the
    # original bare `except:`, which also swallowed KeyboardInterrupt.
    try:
        docs = UnstructuredFileLoader(file_input).load()
    except Exception:
        docs = TextLoader(file_input).load()

    # Prepare file
    docs = text_splitter.split_documents(docs)
    docs = filter_complex_metadata(docs)
|
|
|
# --- Per-chunk summarization --------------------------------------------------

outline = ""
pre_summery = ""

print("\nNumber of Chunks: ", len(docs))

# Token count of the raw text; used below for the compression ratio.
# str.join replaces the original quadratic `t += ...` concatenation loop.
t = "".join(doc.page_content for doc in docs)

nltk_tokens_init = nltk.word_tokenize(t)
print("Number of Tokens: " + str(len(nltk_tokens_init)) + "\n")

bar = tqdm(desc="Loading…", ascii=False, ncols=100, total=len(docs))

# Summarize each chunk separately; enumerate replaces the manual counter.
for count, doc in enumerate(docs, start=1):
    bar.update()
    chunk_text = str(doc.page_content)

    system_prompt = """
You are a professional summarizer. You will be be given a information in chunk section.
Take each chunk and create a very short concise single paragraph summery. The chunk will be under the # CHUNK heading. Only output the summery.
Do not under any circumstance output the # CHUNK section, or bullet points.
"""
    prompt = f"""
Write a paragraph summary of the following CHUNK of text.

# CHUNK
{chunk_text}
"""

    # One short paragraph per chunk from the model.
    outline = generate_text(model, prompt, system_prompt)

    # Despite the instructions, the model sometimes answers with a numbered
    # list; detect that so the outline keeps its line structure.
    outline_lines = outline.split("\n")
    bullet_point = any(is_bulletpoint(line) for line in outline_lines)

    if is_bulletpoint(outline_lines[0]):
        # Whole answer is a list: add a lead-in sentence and indent it.
        outline = "The text contains the following information:\n" + outline.replace("\n", "\n\t")
    elif bullet_point:
        # A list appears later in the answer: just indent every line.
        outline = outline.replace("\n", "\n\t")
    else:
        # Plain prose: collapse to a single line.
        outline = outline.replace("\n", " ")

    # Accumulate a numbered outline entry per chunk.
    pre_summery += "\n\n" + str(count) + "." + outline

bar.close()

# Report how much the per-chunk pass shrank the text.
nltk_tokens = nltk.word_tokenize(pre_summery)
print("\nNumber of Tokens: ", len(nltk_tokens))
print("Compression Ratio: ", round(len(nltk_tokens_init)/len(nltk_tokens), 1))

# Final Summary: merge the numbered chunk summaries into one narrative.
system_prompt = "You are an expert summarizer. Your job is to take all the individual sections under each bullet point. Make sure that the summary is long and detailed. Do not mention anything about sections or chunks and only summarize in paragraph form. Never let your summary's contain outlines or built points"

prompt = f"""
Here are a bunch of bulit points. Please summerize them in paragraph form:

{pre_summery}
"""

final_summery = generate_text(model, prompt, system_prompt)

print("Done")

# Assemble the PDF: overview first, then the chunk-by-chunk outline.
pdf.add_section(Section(f"## Basic Overview\n{final_summery}\n\n"))
pdf.add_section(Section(f"## Outline\n{pre_summery}\n\n"))

pdf.meta["title"] = title
pdf.meta["author"] = "locker98"
pdf.save(f"{title}.pdf")

# print(f"\n\n\n{pre_summery}")
|
|
|