Wikipedie:GPT rozepisovač referencí

Následující program v Pythonu 3 je založený na umělé inteligenci. Projde zadaný článek české Wikipedie (jeho jméno zapište přímo do programu, jde o "holý" program bez uživatelského rozhraní) a vypíše jeho zdrojový kód po rozepsání referencí a odkazů na literaturu do citačních šablon. Nezapomeňte po něm vše zkontrolovat. Výsledek na článku Divadlo ve verzi před 16. květnem 2024 je vidět zde.
Protože program přes API provolává model řady GPT, je potřeba mít na stránkách firmy OpenAI zakoupeno právo model takto používat (https://openai.com/api/).
"""Zadáte článek a program rozepíše reference do šablon"""
import csv
import requests
import re
from openai import OpenAI
from io import StringIO
from bs4 import BeautifulSoup

client = OpenAI(api_key=my_api_key)   # replace my_api_key with your OpenAI API key (paid access is purchased on the OpenAI website)

# Input:
article_name = "Můj článek"  # title of the Czech Wikipedia article to process, in quotes


def get_wikipedia_source(article_title, language="cs", timeout=30):
    """Download the current wikitext of a Wikipedia article.

    Args:
        article_title: Title of the article as used on the wiki.
        language: Language code of the Wikipedia edition (sub-domain).
        timeout: Seconds to wait for the server before giving up; without a
            timeout ``requests`` may block forever on a stalled connection.

    Returns:
        The wikitext of the latest revision (empty string when the revision
        carries no content in its main slot).

    Raises:
        Exception: On a non-200 HTTP status, when the response contains no
            pages, or when the page has no revisions (e.g. it does not exist).
        requests.RequestException: On network failures or timeouts.
    """
    endpoint = f"https://{language}.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "format": "json",
        "prop": "revisions",
        "titles": article_title,
        "rvprop": "content",
        "rvslots": "main",
    }
    # Wikimedia asks API clients to identify themselves; generic library
    # user agents may be throttled or blocked. Header values must stay ASCII.
    headers = {
        "User-Agent": "GPT-rozepisovac-referenci/1.0 "
                      "(cs.wikipedia.org, Wikipedie:GPT rozepisovac referenci)"
    }

    response = requests.get(endpoint, params=params, headers=headers, timeout=timeout)

    if response.status_code != 200:
        raise Exception(f"Error fetching data from Wikipedia API: {response.status_code}")

    data = response.json()

    # The "pages" mapping is keyed by a page ID that is not known in advance.
    pages = data.get("query", {}).get("pages", {})
    if not pages:
        raise Exception("No pages found or an error occurred.")

    # A single title was requested, so the first entry is the page we want.
    page = next(iter(pages.values()))

    revisions = page.get("revisions", [])
    if not revisions:
        raise Exception("No revisions found for this page.")

    # With rvslots=main the wikitext sits under slots -> main -> "*".
    return revisions[0].get("slots", {}).get("main", {}).get("*", "")


def reflist(wikitext):
    """Collect the bodies of all ``<ref ...>...</ref>`` tags in the wikitext.

    Self-closing tags (``<ref name="x" />``) carry no body and are skipped
    by the pattern. Each returned body has surrounding whitespace removed.
    """
    ref_body = re.compile(r'<ref\b(?:(?!/>).)*?>(.*?)</ref>', re.DOTALL)
    bodies = []
    for hit in ref_body.finditer(wikitext):
        bodies.append(hit.group(1).strip())
    return bodies

    
def extract_literatura(wikitext):
    """Return the bullet items of the article's "Literatura" section.

    The section is taken from the ``== Literatura ==`` heading (matched
    case-insensitively) up to the next ``==`` or the end of the text. Every
    item starts with ``*`` at the beginning of a line and may span several
    lines. An empty list is returned when there is no such section.
    """
    heading = re.search(
        r'==\s*Literatura\s*==\s*(.*?)\s*(==|$)',
        wikitext,
        re.DOTALL | re.IGNORECASE,
    )
    if not heading:
        return []

    section_body = heading.group(1).strip()

    # An item runs from its leading asterisk to the next line-leading
    # asterisk (or to the end of the section).
    bullet = re.compile(r'^\*\s*(.*?)(?=^\*|\Z)', re.DOTALL | re.MULTILINE)

    # Remove surrounding whitespace and any extra asterisks of nested bullets.
    return [
        found.group(1).strip().lstrip('*').strip()
        for found in bullet.finditer(section_body)
    ]
    
    
def filter_references(wikitext):
    """Build the filtered list of references and literature items.

    Combines the ``<ref>`` bodies with the items of the "Literatura"
    section, then:
    - drops entries shorter than 15 characters (after stripping),
    - drops duplicates, keeping the first occurrence and its position.
    """
    candidates = reflist(wikitext) + extract_literatura(wikitext)
    stripped = (entry.strip() for entry in candidates)
    # dict.fromkeys keeps insertion order, so it de-duplicates stably.
    return list(dict.fromkeys(text for text in stripped if len(text) >= 15))
    
    
def unapostropher(retezec):
    """Strip the string and remove one pair of enclosing apostrophes, if any.

    Args:
        retezec: Input string.

    Returns:
        The stripped string; when it both starts and ends with ``'`` (and is
        at least two characters long) one apostrophe is removed from each end.
    """
    s = retezec.strip()
    # The length check keeps a lone "'" intact: a single character cannot
    # be an opening and a closing apostrophe at the same time.
    if len(s) >= 2 and s.startswith("'") and s.endswith("'"):
        return s[1:-1]
    return s
    
    
def reftemplate(wikitext):
    """Classify the article's references with the help of the GPT model.

    The filtered references are sent to the model in batches of ten. The
    model answers with CSV rows (number, is it a real reference, URL, is a
    Czech citation template used, best fitting template), which are
    validated and collected.

    Args:
        wikitext: Source text of the article.

    Returns:
        A dict keyed by the reference's index in the filtered list. Entries
        for which a row was parsed have the keys "Reference", "Is_reference"
        ("yes"/"no"/"Error"), "URL" (the URL, "n/a" or "Error"), "Quality"
        ("good"/"bad"/"n/a"/"Error"), "Template" (capitalised template name,
        "N/a" or "Error") and "Row" (the raw CSV row as a string).
        References for which no usable row came back contain only
        "Reference".
    """

    refy = filter_references(wikitext)
    prompt = """The goal is to judge references in a Czech Wikipedia article.
You will be presented a list of references from the article. There is always the nuber of the item,
asterisk and the content. For instance two references may be

7 * Strube, Christine (1977) "Die Formgebung der Apsisdekoration in Qalbloze unde Qalat Siman" ''Jahrbuch 
für Antike und Christentum'' 20: pp.&nbsp;181&#x2013;191, page 187; in German
8 * Strube (1977), p. 188

You will write out a csv table about these items, where there are columns:
 - The number of the item ("7" in the first case above).
 - Whether it contains either a URL or a description of a resource ("Yes" as in the item 7) or 
   only a note / a shortened reference possibly with pages ("No" as in the item 8).
 - Whether it contains a URL; write the last mentioned URL or write "n/a" ("n/a" in the case above).
 - Whether it uses a Czech citation template (i.e. Citace monografie, Citace elektronické monografie, 
   Citace periodika, Citace elektronického periodika, Citace sborníku or Citace kvalifikační práce);  
   write "Good" if the template is used, write "Bad" if the template is not used, or write "n/a" if there 
   is no need for such template ("n/a" in the item 8 above).
 - If the reference contains a description of a resource, write which of the mentioned Czech citation templates 
   would be the best for the given resource ("Citace periodika" in the item 7), otherwise write "n/a" 
Do not comment the output nor the references. The list is:

    """

    vystup = ""
    for i in range(0, len(refy), 10):  # references are presented ten at a time
        blok = refy[i:i + 10]
        # One numbered item per line, as the prompt promises; without the
        # separator the items would run into each other.
        seznam = "\n".join(
            f"{index} * {string}" for index, string in enumerate(blok, start=i)
        )

        response = client.chat.completions.create(
            model="gpt-4o",
            max_tokens=4096,
            messages=[
                {
                    "role": "system",
                    "content": [
                        {
                            "type": "text",
                            "text": "You are a csv generator. Your output can be only a comma separated table. Otherwise write nothing."
                        }
                    ]
                },
                {
                    "role": "user",
                    "content": prompt + seznam
                },
            ]
        )
        msgtext = response.choices[0].message.content
        if msgtext:  # the API may return no text content at all
            vystup += "\n" + msgtext

    # Accepted (lower-cased) values of the individual columns
    valid_is_reference = {"yes", "no"}
    valid_quality = {"good", "bad", "n/a"}
    valid_template = {"n/a", "citace monografie", "citace elektronické monografie", "citace periodika",
                      "citace elektronického periodika", "citace sborníku", "citace kvalifikační práce"}

    # skipinitialspace lets a quoted field that follows ", " be parsed as quoted
    csv_reader = csv.reader(StringIO(vystup), skipinitialspace=True)

    outdict = {}
    error_found = False

    for row in csv_reader:
        try:
            idx = int(row[0])
            is_reference = row[1]
            url = row[2]
            quality = row[3]
            template = row[4]
        except (ValueError, IndexError):
            # Headers, code fences, blank or truncated lines are not data rows
            continue

        # A row numbered outside the list cannot be attached to a reference
        # (a negative number would silently pick one from the end).
        if not 0 <= idx < len(refy):
            continue

        # Check values and set them to "Error" if they are not valid
        rr = unapostropher(is_reference.lower().strip())
        if rr not in valid_is_reference:
            rr = "Error"
            error_found = True

        qq = unapostropher(quality.lower().strip())
        if qq not in valid_quality:
            qq = "Error"
            error_found = True

        # Simple URL validation. The URL keeps its original letter case,
        # but "n/a" is normalised so later equality checks recognise it.
        uu = unapostropher(url)
        uu_lower = uu.lower()
        if uu_lower == "n/a":
            uu = "n/a"
        elif not uu_lower.startswith(("http://", "https://")):
            uu = "Error"
            error_found = True

        tt = unapostropher(template.lower().strip())
        if tt not in valid_template:
            tt = "Error"
            error_found = True

        outdict[idx] = {
            "Reference": refy[idx],
            "Is_reference": rr,
            "URL": uu,
            "Quality": qq,
            "Template": tt.capitalize(),
            "Row": str(row)
        }

    if error_found:
        print("Error Found")

    # References the model returned no usable row for are kept as well,
    # with nothing but their text.
    for i, r in enumerate(refy):
        if i not in outdict:
            print("Nenašel zpracovanou referenci " + r)
            outdict[i] = {'Reference': r}

    return outdict


def stahni_ukazku(url, timeout=30):
    """Download a web page and return its metadata plus the start of its text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A string with the page title, meta description, meta keywords and
        the first 2000 characters of the visible text. An empty string is
        returned when the page cannot be retrieved (network error, invalid
        URL, timeout or a non-200 status).
    """
    # One dead or malformed link must not abort the whole run, so every
    # request-level failure is reported and turned into "no sample".
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as err:
        print(f"Failed to retrieve URL: {err}")
        return ""

    if response.status_code != 200:
        print(f"Failed to retrieve URL: {response.status_code}")
        return ""

    soup = BeautifulSoup(response.content, 'html.parser')

    # <title> may be missing, or have no single string (nested tags)
    title = soup.title.string if soup.title and soup.title.string else "No title"

    # A <meta> tag may lack the content attribute, hence .get()
    description_tag = soup.find('meta', attrs={'name': 'description'})
    description = (description_tag.get('content') if description_tag else None) or "No description"
    keywords_tag = soup.find('meta', attrs={'name': 'keywords'})
    keywords = (keywords_tag.get('content') if keywords_tag else None) or "No keywords"

    # Only the beginning of the text is kept to limit the prompt size
    raw_text = soup.get_text(separator=' ', strip=True)
    raw_text_first = raw_text[:2000]

    metadata = f"Title: {title}\nDescription: {description}\nKeywords: {keywords}\n"
    sample_text = f"First characters of text: {raw_text_first}"

    return f"{metadata}\n{sample_text}"
    

def reftidy(refdict):
    """Add a citation-template version to every reference that needs one.

    Each entry of ``refdict`` is updated in place:
    - "Changed" says whether the reference is to be replaced,
    - "Web" holds a sample of the linked page (only when the entry has an
      http(s) URL and is to be rewritten),
    - "Reformated" holds the rewritten reference (only when "Changed").

    Entries already using a template ("Quality" == "good"), entries that
    are not real references ("Is_reference" == "no") and entries without a
    classification (only "Reference" present) are left unchanged.

    Args:
        refdict: Dict of reference entries keyed by index.

    Returns:
        The same ``refdict`` object.
    """
    for r in refdict.values():
        quality = r.get('Quality')
        is_reference = r.get('Is_reference')

        if quality is None or is_reference is None:
            # Not classified at all - nothing to base a rewrite on
            r['Changed'] = False
        elif quality == "good" or is_reference == "no":
            # Finished references and non-references are kept as they are
            r['Changed'] = False
        else:
            r['Changed'] = True
            url = r.get('URL', "n/a")
            # Only real web addresses are downloaded; "n/a" and "Error"
            # placeholders are skipped.
            if url.lower().startswith(("http://", "https://")):
                r['Web'] = stahni_ukazku(url)

    for r in refdict.values():
        if r['Changed']:
            newref = navrhni_ref(r)  # ask the AI to propose the reference
            if newref:
                r['Reformated'] = newref
            else:
                r['Changed'] = False  # no sensible replacement was produced

    return refdict



def navrhni_ref(r):
    """Ask the GPT model to rewrite one reference into a Czech citation template.

    The prompt contains an example of the template chosen for the entry
    (for "n/a" the choice falls back on whether the entry has a URL),
    optionally the sample of the linked web page, and the original text.

    Args:
        r: Reference entry; reads "Template", "Reference", "URL" (only when
           the template is "n/a") and the optional "Web".

    Returns:
        The first ``{{...}}`` found in the model's answer, or an empty
        string when the answer is missing or contains no template.
    """
    examples = {
        "citace monografie": """Example: {{Citace monografie | jméno = John Ronald Reuel | příjmení = Tolkien 
        | odkaz na autora = John Ronald Reuel Tolkien | titul = Pán prstenů: Dvě věže 
        | překladatelé = Stanislava Pošustová | edice = Třináct 
        | svazek edice = 195 | vydání = 1 | vydavatel = Mladá fronta | místo = Praha | rok vydání = 1991 
        | počet stran = 320 | kapitola = Rohanští jezdci | strany = 19 | isbn = 80-204-0194-6 
        | url= http://www.mf.cz }}""",
        "citace elektronické monografie": """Example: {{Citace elektronické monografie 
        | url=http://www.czso.cz/sldb/sldb.nsf/i/seznam_doporucenych_ukazatelu_scitani 
        | titul=Seznam doporučených ukazatelů sčítání | místo=Praha | vydavatel= [[Český statistický úřad]] 
        | datum aktualizace=2007-09-13 | datum přístupu=2007-09-16 }}""",
        "citace periodika": """Example: {{Citace periodika | příjmení = Smith | jméno = Joseph III 
        | titul = Last Testimony of Sister Emma | periodikum = The Saints' Herald | ročník = 26 | číslo = 19 
        | datum = 1879-10-01 | strany = 289 | url = http://www.lavazone2.com/dbroadhu/IL/sain1872.htm#100179 
        | datum přístupu = 2006-03-23 }}""",
        "citace elektronického periodika": """Example: {{Citace elektronického periodika | autor = Reuters 
        | titul = Favorité selhali, radoval se Ital Staudacher | redaktoři = kal | periodikum = iDNES.cz 
        | odkaz na periodikum = iDNES.cz | datum vydání = 2007-02-06 | datum přístupu = 2007-12-11 
        | url = http://sport.idnes.cz/favorite-selhali-radoval-se-ital-staudacher-fe9-}}""",
        "citace sborníku": """Example: {{Citace sborníku | příjmení = Price | jméno = Derek de Solla 
        | titul = A general theory of bibliometrics | příjmení sestavitele = Griffith 
        | jméno sestavitele = Belver C. | sborník = Key papers in information science | url= http://www.kip.com 
        | vydavatel = Knowledge Industry Publications | místo = New York | rok vydání = 1980 | strany = 177–191}}""",
        "citace kvalifikační práce": """Example: {{Citace kvalifikační práce | příjmení = Holý | jméno = Zdeněk 
        | instituce = Masarykova univerzita, Filozofická fakulta – Seminář dějin umění 
        | odkaz na instituci = Filozofická fakulta Masarykovy univerzity 
        | titul = Staří mistři | url = http://is.muni.cz/th/110849/ff_b/ | typ práce = Bakalářská práce 
        | vedoucí = Miloš Stehlík | odkaz na vedoucího = | místo = Brno | rok vydání = 2006 | počet stran = 61 
        | strany = 5 | datum přístupu = 2012-12-16 | poznámka = | jazyk = }}""",
    }

    chosen = r['Template'].lower()
    if chosen == "n/a":
        # No template suggested: printed source without URL, web source with one
        if r['URL'] == "n/a":
            chosen = "citace monografie"
        else:
            chosen = "citace elektronické monografie"
    # Unknown template names (e.g. "error") get no example at all
    example = examples.get(chosen, "")

    pieces = ["""You will get a reference from the Czech Wikipedia. You will rewrite it using a citation template.
    Write each item in the template on a new line. Do not add new information which is not deducible from 
    the original reference. Drop unused items in the template. Try to use all the pieces of information in 
    the original citation. 
    """, example]

    # Attach the page sample unless it is missing, trivially short or the placeholder page
    has_sample = 'Web' in r and len(r['Web']) > 10 and "Example Domain Example" not in r['Web']
    if has_sample:
        pieces.append("\nYou can use metadata and first characters of the text from the web page. They are:\n" + r['Web'])

    pieces.append("\nThe original Czech reference you are about to rewrite is: " + r['Reference'])
    prompt = "".join(pieces)

    response = client.chat.completions.create(
        model="gpt-4o",    # gpt 3.5 is noticeably worse
        max_tokens=1200,
        messages=[{"role": "user", "content": prompt}],
    )
    answer = response.choices[0].message.content

    if answer is None:
        return ""

    # Keep only the template itself, dropping any commentary of the model
    # ("Surely! Here's the template...")
    found = re.search(r'(?s)\{\{(.*?)\}\}', answer)
    return found.group(0) if found else ""
        

def nahradit(wikitext, refdict, zminka=True):
    """Replace the original references in the page source by the rewritten ones.

    Args:
        wikitext: Source text of the article.
        refdict: Dict of reference entries; an entry is applied when its
            "Changed" is true and it has both "Reference" and "Reformated".
        zminka: When true, an HTML comment quoting the original reference
            is appended after each replacement.

    Returns:
        The wikitext with every occurrence of each changed reference
        replaced.
    """
    for item in refdict.values():
        # Entries without a decision or without a rewritten text are skipped
        if not item.get('Changed'):
            continue
        original_ref = item.get('Reference')
        reformatted_ref = item.get('Reformated')
        if not original_ref or not reformatted_ref:
            continue

        if zminka:
            # A "-->" inside the quoted text would close the comment early
            quoted = original_ref.replace("-->", "--&gt;")
            replacement = f"{reformatted_ref}<!--Automaticky nahrazena reference: {quoted} -->"
        else:
            replacement = reformatted_ref

        # Plain string replacement: re.sub would interpret backslashes and
        # group references in the replacement text and fail or corrupt it.
        wikitext = wikitext.replace(original_ref, replacement)

    return wikitext
        

# the actual work starts here
wikitext = get_wikipedia_source(article_name, language="cs")  # download the article's source text

refdict = reftemplate(wikitext)  # pick out its references and literature items (except very short ones) and classify them

outdict = reftidy(refdict)  # enrich the list of sources and rewrite those not using Czech citation templates

newtext = nahradit(wikitext, refdict)  # replace the original references in the article source by the rewritten ones

print(newtext)  # print the modified source; the user copies it into the "classic" Wikipedia editor and saves it