From 786f3934f996843748a192006a31796d24e3d7ca Mon Sep 17 00:00:00 2001
From: test
Date: Thu, 18 Apr 2024 22:37:11 -0400
Subject: [PATCH] first commit

---
 .gitignore     |   2 +
 code_chat      | 237 +++++++++++++++++++++++++++++++++++++++++++++++++
 code_chat copy | 227 +++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 466 insertions(+)
 create mode 100644 .gitignore
 create mode 100755 code_chat
 create mode 100755 code_chat copy

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..bd98a73
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+*.pdf
+
diff --git a/code_chat b/code_chat
new file mode 100755
index 0000000..8698265
--- /dev/null
+++ b/code_chat
@@ -0,0 +1,237 @@
+#!/usr/bin/env python
+"""Summarize a SQL script, web page or YouTube transcript into a PDF report.
+
+The input (a file path or URL) is taken from the first command-line argument,
+split into chunks, and each chunk is summarized by the model served at
+http://localhost:11434/api/generate. The numbered chunk summaries and an
+overall summary built from them are saved as "TITLE.pdf".
+"""
+
+#from langchain.embeddings import FastEmbedEmbeddings
+#from langchain.schema.output_parser import StrOutputParser
+from langchain.document_loaders import UnstructuredFileLoader, WebBaseLoader, YoutubeLoader, TextLoader
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+#from langchain.schema.runnable import RunnablePassthrough
+#from langchain.prompts import PromptTemplate
+#from langchain.schema.document import Document
+from langchain.vectorstores.utils import filter_complex_metadata
+#from langchain_community.embeddings import OllamaEmbeddings
+#import mimetypes
+import os
+import json
+import requests
+#from pathlib import Path
+from rich.markdown import Markdown
+from rich.console import Console
+import re
+import sys
+from urllib.parse import urlparse, parse_qs
+#from youtube_transcript_api import YouTubeTranscriptApi
+import nltk
+from tqdm import tqdm
+from markdown_pdf import Section, MarkdownPdf
+
+
+pdf = MarkdownPdf(toc_level=2)
+
+
+# Exit with a usage line instead of an IndexError when no input is given.
+if len(sys.argv) < 2:
+    sys.exit(f"usage: {os.path.basename(sys.argv[0])} FILE_OR_URL")
+file_input = sys.argv[1]
+
+# splitext() has already removed the extension. The former extra
+# .replace(file_extension, '') also deleted the same text from the middle of
+# the name ("a.sql.old.sql" became "a.old").
+filename, file_extension = os.path.splitext(file_input)
+title = os.path.basename(filename)
+
+
+pdf.add_section(Section(f"# {title}\n", toc=True))
+
+
+model = "dolphin-mistral:latest"
+#model = "mistral:latest"
+
+vector_store = None
+retriever = None
+chain = None
+docs = None
+
+def is_bulletpoint(s):
+    """Return True if any of the first five characters of `s` is a digit.
+
+    Used to spot numbered-list lines such as "1. ..." in the model output.
+    """
+    return any(char.isdigit() for char in s[:5])
+
+def generate_text(model, prompt, system = ""):
+    """POST one non-streaming generation request and return the reply text.
+
+    model, prompt and system are passed through in the JSON request body.
+    Raises requests.HTTPError if the server answers with an error status.
+    """
+    url = "http://localhost:11434/api/generate"
+    data = {
+        "model": model,
+        "prompt": prompt,
+        "system": system,
+        "stream": False,
+        "options": {
+            "temperature": 0.4,
+        }
+    }
+    response = requests.post(url, json=data)
+    # Surface HTTP failures directly rather than as a KeyError on "response".
+    response.raise_for_status()
+    text = json.loads(response.text)
+    return text["response"]
+
+
+def isyoutubevideo(youtube_url):
+    """Return True for youtu.be links and youtube.com URLs with a "v" parameter.
+
+    The host is checked first so that an unrelated URL that merely carries a
+    "v" query parameter is loaded as a web page, not as a video.
+    """
+    parsed_url = urlparse(youtube_url)
+    host = (parsed_url.hostname or "").lower()
+    if host == "youtu.be":
+        return True
+    if host == "youtube.com" or host.endswith(".youtube.com"):
+        return 'v' in parse_qs(parsed_url.query)
+    return False
+
+
+def youtube_video_id(youtube_url):
+    """Return the video id of a URL accepted by isyoutubevideo()."""
+    parsed_url = urlparse(youtube_url)
+    query_params = parse_qs(parsed_url.query)
+    if 'v' in query_params:
+        return query_params['v'][0]
+    # youtu.be/ID: the id is the path, whatever query string follows it.
+    return parsed_url.path.strip("/")
+
+
+def is_url(string):
+    """Return True if `string` starts with "http://" or "https://"."""
+    pattern = r"^https?://"
+    return bool(re.search(pattern, string))
+
+
+
+#text_splitter = RecursiveCharacterTextSplitter(chunk_size=2048, chunk_overlap=100)
+text_splitter = RecursiveCharacterTextSplitter(chunk_size=4096, chunk_overlap=100)
+
+# Checking if url or if file path
+if is_url(file_input):
+
+    # See if youtube link
+    if isyoutubevideo(file_input):
+        print("Loading youtube video...")
+        # Take the id from the parsed URL: slicing the last 11 characters
+        # broke on links with trailing parameters such as "&t=30s".
+        video_id = youtube_video_id(file_input)
+        docs = YoutubeLoader(video_id).load()
+
+    else:
+        print("Loading url...")
+        # Extract and load webpage text
+        docs = WebBaseLoader(file_input).load()
+
+else:
+    # Load File
+    try:
+        docs = UnstructuredFileLoader(file_input).load()
+    except Exception:
+        # Best-effort fallback to a plain-text read. Exception rather than a
+        # bare except, so Ctrl-C and SystemExit still stop the script.
+        docs = TextLoader(file_input).load()
+
+# Prepare text
+docs = text_splitter.split_documents(docs)
+docs = filter_complex_metadata(docs)
+
+
+outline = ""
+pre_summery = ""
+
+
+print("\nNumber of Chunks: ", len(docs))
+
+
+t = "".join(a.page_content for a in docs)
+
+nltk_tokens_init = nltk.word_tokenize(t)
+print("Number of Tokens: " + str(len(nltk_tokens_init)) + "\n")
+
+
+bar = tqdm(desc="Loading…", ascii=False, ncols=100, total=len(docs))
+for count, x in enumerate(docs, start=1):
+    bar.update()
+    chunk_text = str(x.page_content)
+
+    system_prompt = """
+You are a professional code summarizer. You will be be given a SQL query in chunk section.
+Take each chunk and create a very short concise single paragraph summery. The chunk will be under the # CHUNK heading. Only output the summery.
+Do not under any circumstance output the # CHUNK section, SQL code, or bullet points.
+"""
+    prompt = f"""
+Write a paragraph summary of the following CHUNK of sql code.
+
+# CHUNK
+{chunk_text}
+"""
+
+    outline = generate_text(model, prompt, system_prompt)
+
+    # If any line looks like a list item, tab-indent the continuation lines;
+    # otherwise flatten the reply onto a single line.
+    outline_lines = outline.split("\n")
+    bullet_point = any(is_bulletpoint(line) for line in outline_lines)
+
+    if is_bulletpoint(outline_lines[0]):
+        outline = " The SQL script performs the following tasks:\n" + outline.replace("\n", "\n\t")
+    elif bullet_point:
+        outline = outline.replace("\n", "\n\t")
+    else:
+        outline = outline.replace("\n", " ")
+
+    pre_summery += "\n\n" + str(count) + "." + outline
+
+    #print("\n\n--------------------------------------------------------------------------------------------")
+    #print(outline)
+
+
+bar.close()
+
+nltk_tokens = nltk.word_tokenize(pre_summery)
+print("\nNumber of Tokens: ", len(nltk_tokens))
+# No chunks means no summary tokens; skip the ratio rather than divide by zero.
+if nltk_tokens:
+    print("Compression Ratio: ", round(len(nltk_tokens_init)/len(nltk_tokens),1))
+
+
+# Final Summary
+system_prompt = "You are an expert summarizer. Your Job it to take all the individual sections under each bullet point. Make sure that the summary is long and detailed. Do not mention anything about sections or chunks and only summarize in paragraph form. Never let your summary's contain outlines or built points"
+
+prompt = f"""
+Here are a bunch of bulit points. Please summerize them:
+
+{pre_summery}
+"""
+
+final_summery = generate_text(model, prompt, system_prompt)
+
+
+print("Done")
+
+pdf.add_section(Section(f"## Basic Overview\n{final_summery}\n\n"))
+pdf.add_section(Section(f"## Code Outline\n{pre_summery}\n\n"))
+
+pdf.meta["title"] = title
+pdf.meta["author"] = "locker98"
+pdf.save(f"{title}.pdf")
+
+
+#print(f"\n\n\n{pre_summery}")
\ No newline at end of file
diff --git a/code_chat copy b/code_chat copy
new file mode 100755
index 0000000..0f0c534
--- /dev/null
+++ b/code_chat copy
@@ -0,0 +1,227 @@
+#!/usr/bin/env python
+"""Summarize a SQL script, web page or YouTube transcript into a PDF report.
+
+The input (a file path or URL) is taken from the first command-line argument,
+split into chunks, and each chunk is summarized by the model served at
+http://localhost:11434/api/generate. The numbered chunk summaries and an
+overall summary built from them are saved as "TITLE.pdf".
+"""
+
+#from langchain.embeddings import FastEmbedEmbeddings
+#from langchain.schema.output_parser import StrOutputParser
+from langchain.document_loaders import UnstructuredFileLoader, WebBaseLoader, YoutubeLoader, TextLoader
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+#from langchain.schema.runnable import RunnablePassthrough
+#from langchain.prompts import PromptTemplate
+#from langchain.schema.document import Document
+from langchain.vectorstores.utils import filter_complex_metadata
+#from langchain_community.embeddings import OllamaEmbeddings
+#import mimetypes
+import os
+import json
+import requests
+#from pathlib import Path
+from rich.markdown import Markdown
+from rich.console import Console
+import re
+import sys
+from urllib.parse import urlparse, parse_qs
+#from youtube_transcript_api import YouTubeTranscriptApi
+import nltk
+from tqdm import tqdm
+from markdown_pdf import Section, MarkdownPdf
+
+
+pdf = MarkdownPdf(toc_level=2)
+
+
+# Exit with a usage line instead of an IndexError when no input is given.
+if len(sys.argv) < 2:
+    sys.exit(f"usage: {os.path.basename(sys.argv[0])} FILE_OR_URL")
+file_input = sys.argv[1]
+
+# splitext() has already removed the extension. The former extra
+# .replace(file_extension, '') also deleted the same text from the middle of
+# the name ("a.sql.old.sql" became "a.old").
+filename, file_extension = os.path.splitext(file_input)
+title = os.path.basename(filename)
+
+
+pdf.add_section(Section(f"# {title}\n", toc=True))
+
+
+model = "dolphin-mistral:latest"
+#model = "mistral:latest"
+
+vector_store = None
+retriever = None
+chain = None
+docs = None
+
+def generate_text(model, prompt, system = ""):
+    """POST one non-streaming generation request and return the reply text.
+
+    model, prompt and system are passed through in the JSON request body.
+    Raises requests.HTTPError if the server answers with an error status.
+    """
+    url = "http://localhost:11434/api/generate"
+    data = {
+        "model": model,
+        "prompt": prompt,
+        "system": system,
+        "stream": False,
+        "options": {
+            "temperature": 0.6,
+        }
+    }
+    response = requests.post(url, json=data)
+    # Surface HTTP failures directly rather than as a KeyError on "response".
+    response.raise_for_status()
+    text = json.loads(response.text)
+    return text["response"]
+
+
+def isyoutubevideo(youtube_url):
+    """Return True for youtu.be links and youtube.com URLs with a "v" parameter.
+
+    The host is checked first so that an unrelated URL that merely carries a
+    "v" query parameter is loaded as a web page, not as a video.
+    """
+    parsed_url = urlparse(youtube_url)
+    host = (parsed_url.hostname or "").lower()
+    if host == "youtu.be":
+        return True
+    if host == "youtube.com" or host.endswith(".youtube.com"):
+        return 'v' in parse_qs(parsed_url.query)
+    return False
+
+
+def youtube_video_id(youtube_url):
+    """Return the video id of a URL accepted by isyoutubevideo()."""
+    parsed_url = urlparse(youtube_url)
+    query_params = parse_qs(parsed_url.query)
+    if 'v' in query_params:
+        return query_params['v'][0]
+    # youtu.be/ID: the id is the path, whatever query string follows it.
+    return parsed_url.path.strip("/")
+
+
+def is_url(string):
+    """Return True if `string` starts with "http://" or "https://"."""
+    pattern = r"^https?://"
+    return bool(re.search(pattern, string))
+
+
+
+#text_splitter = RecursiveCharacterTextSplitter(chunk_size=2048, chunk_overlap=100)
+text_splitter = RecursiveCharacterTextSplitter(chunk_size=4096, chunk_overlap=100)
+
+# Checking if url or if file path
+if is_url(file_input):
+
+    # See if youtube link
+    if isyoutubevideo(file_input):
+        print("Loading youtube video...")
+        # Take the id from the parsed URL: slicing the last 11 characters
+        # broke on links with trailing parameters such as "&t=30s".
+        video_id = youtube_video_id(file_input)
+        docs = YoutubeLoader(video_id).load()
+
+    else:
+        print("Loading url...")
+        # Extract and load webpage text
+        docs = WebBaseLoader(file_input).load()
+
+else:
+    # Load File
+    try:
+        docs = UnstructuredFileLoader(file_input).load()
+    except Exception:
+        # Best-effort fallback to a plain-text read. Exception rather than a
+        # bare except, so Ctrl-C and SystemExit still stop the script.
+        docs = TextLoader(file_input).load()
+
+# Prepare text
+docs = text_splitter.split_documents(docs)
+docs = filter_complex_metadata(docs)
+
+
+outline = ""
+pre_summery = ""
+
+
+print("\nNumber of Chunks: ", len(docs))
+
+
+t = "".join(a.page_content for a in docs)
+
+nltk_tokens = nltk.word_tokenize(t)
+print("Number of Tokens: " + str(len(nltk_tokens)) + "\n")
+
+
+bar = tqdm(desc="Loading…", ascii=False, ncols=100, total=len(docs))
+for count, x in enumerate(docs, start=1):
+    bar.update()
+    chunk_text = str(x.page_content)
+
+    system_prompt = """
+You are a professional code summarizer. You will be be given a SQL query in chunk section.
+Take each chunk and create a very short concise summery. The chunk will be under the # CHUNK heading. Only output the summery.
+Do not under any circumstance output the # CHUNK section or any SQL code.
+"""
+
+
+    prompt = f"""
+# CHUNK
+{chunk_text}
+
+"""
+
+## SUMMERY
+#{outline}
+#"""
+
+    outline = generate_text(model, prompt, system_prompt)
+
+    outline = outline.replace("\n", " ")
+
+    pre_summery += "\n\n" + str(count) + ". " + outline
+    #print("\n\n\n")
+    #print(outline)
+    #print("\n\n\n")
+
+
+#print("\n\n\n-----------------------------------------------------------------------------------\n\n\n")
+#print(pre_summery)
+
+bar.close()
+
+nltk_tokens = nltk.word_tokenize(pre_summery)
+print("\nNumber of Tokens: ", len(nltk_tokens))
+
+
+
+#print(pre_summery)
+
+
+system_prompt = "You are an expert summarizer. Your Job it to take all the individual sections under each bullet point. Make sure that the summary is long and detailed. Do not mention anything about sections or chunks and only summarize in paragraph form. Never let your summary's contain outlines or built points"
+
+prompt = f"""
+Here are a bunch of bulit points. Please summerize them:
+
+{pre_summery}
+"""
+
+final_summery = generate_text(model, prompt, system_prompt)
+
+
+#print("\n\n\n------------------------------Final Summery-----------------------------------------------\n\n\n")
+#print(final_summery)
+print("Done")
+
+pdf.add_section(Section(f"## Basic Overview\n{final_summery}\n\n"))
+pdf.add_section(Section(f"## Code Outline\n{pre_summery}\n\n"))
+
+pdf.meta["title"] = title
+pdf.meta["author"] = "locker98"
+pdf.save(f"{title}.pdf")
\ No newline at end of file