From 565ee99aac97d90ab101bee7a8ced1fe3239cbfa Mon Sep 17 00:00:00 2001
From: test <test@test.com>
Date: Thu, 18 Apr 2024 23:02:43 -0400
Subject: [PATCH] organized

---
 code_chat => other-talk         | 49 ++++++------------
 code_chat copy => sql-code-talk | 89 +++++++++++++++------------------
 2 files changed, 55 insertions(+), 83 deletions(-)
 rename code_chat => other-talk (83%)
 rename code_chat copy => sql-code-talk (70%)

diff --git a/code_chat b/other-talk
similarity index 83%
rename from code_chat
rename to other-talk
index 8698265..81c3aab 100755
--- a/code_chat
+++ b/other-talk
@@ -1,25 +1,14 @@
 #!/usr/bin/env python
 
-#from langchain.embeddings import FastEmbedEmbeddings
-#from langchain.schema.output_parser import StrOutputParser
 from langchain.document_loaders import UnstructuredFileLoader, WebBaseLoader, YoutubeLoader, TextLoader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
-#from langchain.schema.runnable import RunnablePassthrough
-#from langchain.prompts import PromptTemplate
-#from langchain.schema.document import Document
 from langchain.vectorstores.utils import filter_complex_metadata
-#from langchain_community.embeddings import OllamaEmbeddings
-#import mimetypes
 import os
 import json
 import requests
-#from pathlib import Path
-from rich.markdown import Markdown
-from rich.console import Console
 import re
 import sys
 from urllib.parse import urlparse, parse_qs
-#from youtube_transcript_api import YouTubeTranscriptApi
 import nltk
 from tqdm import tqdm
 from markdown_pdf import Section, MarkdownPdf
@@ -37,21 +26,23 @@ title = os.path.basename(filename).replace(file_extension, '')
 pdf.add_section(Section(f"# {title}\n", toc=True))
 
 
-model = "dolphin-mistral:latest"
-#model = "mistral:latest"
+model = "llama3:latest"
+# model = "mistral:latest"
 
 vector_store = None
 retriever = None
 chain = None
 docs = None
 
+
 def is_bulletpoint(s):
     for char in s[:5]:
         if char.isdigit():
             return True
     return False
 
-def generate_text(model, prompt, system = ""):
+
+def generate_text(model, prompt, system=""):
     url = "http://localhost:11434/api/generate"
     data = {
         "model": model,
@@ -84,15 +75,14 @@ def is_url(string):
 
 
 
-#text_splitter = RecursiveCharacterTextSplitter(chunk_size=2048, chunk_overlap=100)
+# text_splitter = RecursiveCharacterTextSplitter(chunk_size=2048, chunk_overlap=100)
 text_splitter = RecursiveCharacterTextSplitter(chunk_size=4096, chunk_overlap=100)
 
 # Checking if url or if file path
 if is_url(file_input):
-    
 
     # See if youtube link
-    if isyoutubevideo(file_input) == True:
+    if isyoutubevideo(file_input):
         print("Loading youtube video...")
         # Prepare youtube url for transcript extraction
         #parsed_url = urlparse(file_input)
@@ -108,7 +98,6 @@ if is_url(file_input):
         # Extract and load webpage text
         docs = WebBaseLoader(file_input).load()
 
-
     # Prepare text
     docs = text_splitter.split_documents(docs)
     docs = filter_complex_metadata(docs)
@@ -124,10 +113,6 @@ else:
     docs = filter_complex_metadata(docs)
 
 
-
-
-
-
 outline = ""
 pre_summery = ""
 
@@ -164,11 +149,11 @@ Write a paragraph summary of the following CHUNK of sql code.
 # CHUNK
 {chunk_text}
 """
-    
+
     outline = generate_text(model, prompt, system_prompt)
-    
+
     bullet_point = False
-    
+
     for x in outline.split("\n"):
         if is_bulletpoint(x):
             bullet_point = True
@@ -180,21 +165,17 @@ Write a paragraph summary of the following CHUNK of sql code.
     else:
         outline = outline.replace("\n", " ")
 
-    
     pre_summery += "\n\n" + str(count) + "." + outline
 
-    #print("\n\n--------------------------------------------------------------------------------------------")
-    #print(outline)
+    # print("\n\n--------------------------------------------------------------------------------------------")
+    # print(outline)
 
 
 bar.close()
 
 nltk_tokens = nltk.word_tokenize(pre_summery)
 print("\nNumber of Tokens: ", len(nltk_tokens))
-print("Compression Ratio: ", round(len(nltk_tokens_init)/len(nltk_tokens),1))
-
-
-
+print("Compression Ratio: ", round(len(nltk_tokens_init)/len(nltk_tokens), 1))
 
 
 # Final Summary
@@ -218,7 +199,5 @@ pdf.meta["title"] = title
 pdf.meta["author"] = "locker98"
 pdf.save(f"{title}.pdf")
 
+# print(f"\n\n\n{pre_summery}")
 
-
-
-#print(f"\n\n\n{pre_summery}")
\ No newline at end of file
diff --git a/code_chat copy b/sql-code-talk
similarity index 70%
rename from code_chat copy
rename to sql-code-talk
index 0f0c534..81c3aab 100755
--- a/code_chat copy	
+++ b/sql-code-talk
@@ -1,25 +1,14 @@
 #!/usr/bin/env python
 
-#from langchain.embeddings import FastEmbedEmbeddings
-#from langchain.schema.output_parser import StrOutputParser
 from langchain.document_loaders import UnstructuredFileLoader, WebBaseLoader, YoutubeLoader, TextLoader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
-#from langchain.schema.runnable import RunnablePassthrough
-#from langchain.prompts import PromptTemplate
-#from langchain.schema.document import Document
 from langchain.vectorstores.utils import filter_complex_metadata
-#from langchain_community.embeddings import OllamaEmbeddings
-#import mimetypes
 import os
 import json
 import requests
-#from pathlib import Path
-from rich.markdown import Markdown
-from rich.console import Console
 import re
 import sys
 from urllib.parse import urlparse, parse_qs
-#from youtube_transcript_api import YouTubeTranscriptApi
 import nltk
 from tqdm import tqdm
 from markdown_pdf import Section, MarkdownPdf
@@ -37,15 +26,23 @@ title = os.path.basename(filename).replace(file_extension, '')
 pdf.add_section(Section(f"# {title}\n", toc=True))
 
 
-model = "dolphin-mistral:latest"
-#model = "mistral:latest"
+model = "llama3:latest"
+# model = "mistral:latest"
 
 vector_store = None
 retriever = None
 chain = None
 docs = None
 
-def generate_text(model, prompt, system = ""):
+
+def is_bulletpoint(s):
+    for char in s[:5]:
+        if char.isdigit():
+            return True
+    return False
+
+
+def generate_text(model, prompt, system=""):
     url = "http://localhost:11434/api/generate"
     data = {
         "model": model,
@@ -53,7 +50,7 @@ def generate_text(model, prompt, system = ""):
         "system": system,
         "stream": False,
         "options": {
-            "temperature": 0.6,
+            "temperature": 0.4,
         }
     }
     response = requests.post(url, json=data)
@@ -78,15 +75,14 @@ def is_url(string):
 
 
 
-#text_splitter = RecursiveCharacterTextSplitter(chunk_size=2048, chunk_overlap=100)
+# text_splitter = RecursiveCharacterTextSplitter(chunk_size=2048, chunk_overlap=100)
 text_splitter = RecursiveCharacterTextSplitter(chunk_size=4096, chunk_overlap=100)
 
 # Checking if url or if file path
 if is_url(file_input):
-    
 
     # See if youtube link
-    if isyoutubevideo(file_input) == True:
+    if isyoutubevideo(file_input):
         print("Loading youtube video...")
         # Prepare youtube url for transcript extraction
         #parsed_url = urlparse(file_input)
@@ -102,7 +98,6 @@ if is_url(file_input):
         # Extract and load webpage text
         docs = WebBaseLoader(file_input).load()
 
-
     # Prepare text
     docs = text_splitter.split_documents(docs)
     docs = filter_complex_metadata(docs)
@@ -118,10 +113,6 @@ else:
     docs = filter_complex_metadata(docs)
 
 
-
-
-
-
 outline = ""
 pre_summery = ""
 
@@ -134,8 +125,8 @@ t = ""
 for a in docs:
     t += a.page_content
 
-nltk_tokens = nltk.word_tokenize(t)
-print("Number of Tokens: " + str(len(nltk_tokens)) + "\n")
+nltk_tokens_init = nltk.word_tokenize(t)
+print("Number of Tokens: " + str(len(nltk_tokens_init)) + "\n")
 
 
 bar = tqdm(desc="Loading…", ascii=False, ncols=100, total=len(docs))
@@ -149,44 +140,45 @@ for x in docs:
 
     system_prompt = """
 You are a professional code summarizer. You will be be given a SQL query in chunk section.
-Take each chunk and create a very short concise summery. The chunk will be under the # CHUNK heading. Only output the summery.
-Do not under any circumstance output the # CHUNK section or any SQL code.
+Take each chunk and create a very short concise single paragraph summery. The chunk will be under the # CHUNK heading. Only output the summery.
+Do not under any circumstance output the # CHUNK section, SQL code, or bullet points.
 """
-
-
     prompt = f"""
+Write a paragraph summary of the following CHUNK of sql code.
+
 # CHUNK
 {chunk_text}
-
 """
 
-## SUMMERY
-#{outline}
-#"""
-
     outline = generate_text(model, prompt, system_prompt)
 
-    outline = outline.replace("\n", " ")
+    bullet_point = False
 
-    pre_summery += "\n\n" + str(count) + ". " + outline
-    #print("\n\n\n")
-    #print(outline)
-    #print("\n\n\n")
+    for x in outline.split("\n"):
+        if is_bulletpoint(x):
+            bullet_point = True
 
+    if is_bulletpoint(outline.split("\n")[0]):
+        outline =  " The SQL script performs the following tasks:\n" + outline.replace("\n", "\n\t")
+    elif bullet_point:
+        outline = outline.replace("\n", "\n\t")
+    else:
+        outline = outline.replace("\n", " ")
+
+    pre_summery += "\n\n" + str(count) + "." + outline
+
+    # print("\n\n--------------------------------------------------------------------------------------------")
+    # print(outline)
 
-#print("\n\n\n-----------------------------------------------------------------------------------\n\n\n")
-#print(pre_summery)
 
 bar.close()
 
 nltk_tokens = nltk.word_tokenize(pre_summery)
 print("\nNumber of Tokens: ", len(nltk_tokens))
+print("Compression Ratio: ", round(len(nltk_tokens_init)/len(nltk_tokens), 1))
 
 
-
-#print(pre_summery)
-
-
+# Final Summary
 system_prompt = "You are an expert summarizer. Your Job it to take all the individual sections under each bullet point. Make sure that the summary is long and detailed. Do not mention anything about sections or chunks and only summarize in paragraph form. Never let your summary's contain outlines or built points"
 
 prompt = f"""
@@ -198,8 +190,6 @@ Here are a bunch of bulit points. Please summerize them:
 final_summery = generate_text(model, prompt, system_prompt)
 
 
-#print("\n\n\n------------------------------Final Summery-----------------------------------------------\n\n\n")
-#print(final_summery)
 print("Done")
 
 pdf.add_section(Section(f"## Basic Overview\n{final_summery}\n\n"))
@@ -207,4 +197,7 @@ pdf.add_section(Section(f"## Code Outline\n{pre_summery}\n\n"))
 
 pdf.meta["title"] = title
 pdf.meta["author"] = "locker98"
-pdf.save(f"{title}.pdf")
\ No newline at end of file
+pdf.save(f"{title}.pdf")
+
+# print(f"\n\n\n{pre_summery}")
+