Wikipedie:GPT vypisovač informací a referencí z interwiki

The following Python 3 program is based on artificial intelligence. You give it an article from the Czech Wikipedia that has interwiki links (write its name directly into the program; it is a "bare" program with no user interface). The program prints a list of the facts it found in the interwiki versions, together with any references. It requires access to OpenAI's paid API, which has to be purchased on their website.

Sample outputs are at Diskuse:Acidifikace and Diskuse:Agama australská.
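
With the default settings the table keeps the columns Fact_cs, Reference, Importance, Heading and Language. The generated markup is a sortable wikitable roughly along these lines (the values shown are illustrative):

{| class="wikitable sortable"
! Fact_cs !! Reference !! Importance !! Heading !! Language
|-
| … || <nowiki><ref>…</ref></nowiki> || Important || Life || en
|}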


import requests
import re
import pprint
import csv
from openai import OpenAI
from io import StringIO
from collections import defaultdict

client = OpenAI(api_key=my_api_key)   # replace my_api_key with your OpenAI API key - it can be purchased on their website

"""
Program najde n nejdelších interwiki zadaného článku. Z nich vypíše informace, 
zhodnotí je podle důležitosti vzhledem k tématu článku, 
uvede k nim příslušné zdroje, pokud tam jsou, a kapitolu, kde se vyskytují.
Výpis je v podobě wikitabulky, kterou si můžete vložit na pískoviště nebo
na diskusní stránku analyzovaného článku.
"""

# inputs
article_title = "Jan Čep"  # name of the input article, in quotes
language = "cs"  # the Wikipedia it is on
n = 3  # number of longest interwiki articles to take into account
output_unsourced_facts = False  # whether to also print unsourced facts (True) or not (False)
output_all_columns = False  # whether to print all columns (True) or only the most important ones (False)

def get_wikipedia_source(article_title, language="cs"):
    """Gets the source code of a Wikipedia article"""
    # Define the endpoint and parameters
    endpoint = f"https://{language}.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "format": "json",
        "prop": "revisions",
        "titles": article_title,
        "rvprop": "content",
        "rvslots": "main"
    }

    # Make the request to the Wikipedia API
    response = requests.get(endpoint, params=params)
    
    if response.status_code != 200:
        raise Exception(f"Error fetching data from Wikipedia API: {response.status_code}")

    data = response.json()

    # Extract the page ID, as the structure of the response contains dynamic page IDs
    pages = data.get("query", {}).get("pages", {})
    
    if not pages:
        raise Exception("No pages found or an error occurred.")

    page_id = next(iter(pages))  # Get the first (and likely only) page ID key

    # Extract the content of the page
    page = pages[page_id]
    revisions = page.get("revisions", [])
    
    if not revisions:
        raise Exception("No revisions found for this page.")

    content = revisions[0].get("slots", {}).get("main", {}).get("*", "")
    
    return content
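
# The function above unwraps a JSON response shaped roughly like this
# (a sketch trimmed to the fields actually used):
# {"query": {"pages": {"<page id>": {"revisions": [
#     {"slots": {"main": {"*": "...wikitext..."}}}]}}}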


def split_chapters(wikitext):
    """
    Splits a source of a Wikipedia article into chapters of == level 2 ==, creates a dictionary of them
    """
    # Splitting text into lines for easier processing
    lines = wikitext.split('\n')
    
    # Regex pattern to identify headings (anchored at line start; text after the closing ='s is tolerated)
    heading_pattern = re.compile(r'^(=+)\s*(.*?)\s*(=+)')

    # Dictionary to store chapter information
    chapters = defaultdict(lambda: {"Text": "", "Length": 0, "Heading": ""})
    
    # Initial variables to keep track of current chapter and its content
    current_chapter = 0
    chapter_text = []
    chapter_heading = ""

    for line in lines:
        heading_match = heading_pattern.match(line)
        if heading_match:
            # If we reach a new heading, save the previous chapter's information
            if chapter_text or chapter_heading:
                chapters[current_chapter]["Text"] = "\n".join(chapter_text)
                chapters[current_chapter]["Length"] = len(chapters[current_chapter]["Text"])
                chapters[current_chapter]["Heading"] = chapter_heading

            # Determine the chapter number by heading level
            level = len(heading_match.group(1))
            if level == 2:
                current_chapter += 1 if chapters[current_chapter]["Text"] else 0
                chapter_heading = heading_match.group(2).strip()
                chapter_text = [line]
            elif level > 2:
                chapter_text.append(line)

        else:
            # Add line to current chapter text
            chapter_text.append(line)

    # Save the last chapter's information
    if chapter_text or chapter_heading:
        chapters[current_chapter]["Text"] = "\n".join(chapter_text)
        chapters[current_chapter]["Length"] = len(chapters[current_chapter]["Text"])
        chapters[current_chapter]["Heading"] = chapter_heading

    return dict(chapters)
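
# split_chapters returns a structure shaped roughly like this (headings illustrative);
# key 0 is the lead section, which has an empty Heading:
# {0: {"Text": "...", "Length": 1234, "Heading": ""},
#  1: {"Text": "== Život ==\n...", "Length": 567, "Heading": "Život"}}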
    
    
def unapostropher(retezec):
    """Stripne řetězec a pokud je uzavřen v apostrofech, odebere je"""
    s = retezec.strip()
    if s.startswith("'") and s.endswith("'"):
        # Strip one apostrophe from both ends
        return s[1:-1]
    else:
        return s
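
# Example: unapostropher("  'Jan Čep'  ") returns "Jan Čep";
# a string without enclosing apostrophes is only stripped.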
    

def reftemplate(chapters, article, language):
    """Projede kapitoly wikičlánku a vytvoří seznam obsažených faktů a jejich referencí
    argumenty: dictionary kapitol a název článku"""
    
    prompt = """Your task is to extract facts and their references from a chapter of a Wikipedia article.
Try to extract as many facts as possible. If the chapter has no fluent text (it contains tables, pictures, 
lists, external links...), you will print nothing.

You will create a tab delimited table. You will create as many rows as needed.

"""
    prompt += f"The first column is the short statement of the fact you have found. Write in the language \
      of the article, that is {language}. Be succinct and use as much the original language of the text \
      as possible. Write in complete self-sufficient sentences."
    
    prompt += """
The second column is the same fact in Czech. It is simply a translation of the first column.

The third column is the reference or references used to support the fact in the text. Copy it as is, including
the <ref> tags and/or citation templates like {{Harv|Blust|1999|p=12}}. If the fact is not referenced,
leave the column empty.

The fourth column is the importance of the fact regarding the theme of the article: "Very important" if it should 
be mentioned even in a very short article about the subject. "Important" if it should be mentioned in an average
Wikipedia article. "Useful" if it belongs in a very detailed account only. "Unimportant" if it has only indirect
relevance.

Do not comment on the output or on the references.
    """

    prompt += f"\nThe theme of the article is {article}, its language is {language}. The text of the chapter follows:\n"


    vystup = ""
    for i in range(len(chapters)):  # prochází kapitoly            
        chap = chapters[i]
        
        response = client.chat.completions.create(
            model="gpt-4o",
            max_tokens=4096,
            messages=[
                {
                    "role": "system",
                    "content": [
                        {
                            "type": "text",
                            "text": """You are a csv generator. Your output can only be a tab-separated
                            table, without a heading and without a type description, starting from
                            the first line. Otherwise write nothing.""",
                        }
                    ],
                },
                {
                    "role": "user",
                    "content": prompt + chap["Text"],
                },
            ],
        )
        vystup = response.choices[0].message.content
        csv_file = StringIO(vystup)  # Convert the CSV content to a file-like object

        # Read the CSV file
        csv_reader = csv.reader(csv_file, delimiter='\t')

        out, cnt = {}, 0
        
        for row in csv_reader:
            
            errorfree = True
            
            try:
                fact = unapostropher(row[0].strip())
                fact_cs = unapostropher(row[1].strip())
                ref = unapostropher(row[2].strip())
                importance = unapostropher(row[3].strip()).lower().capitalize()

                # Check values and set as "Error" if they're not valid
                if importance not in {"Very important", "Important", "Useful", "Unimportant"}:
                    importance = "Error"
                    errorfree = False

                # Create the dictionary entry
                if errorfree:
                    out[cnt] = {
                        "Fact": fact,
                        "Fact_cs": fact_cs,
                        "Reference": ref,
                        "Importance": importance
                    }

                    cnt += 1
                else:
                    print("Error found in " + chap["Heading"])

            except (ValueError, IndexError):  # Ignore malformed lines
                continue

        chap["Facts"] = out

    return chapters
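
# Each chapter now carries a "Facts" dictionary; one entry looks roughly like:
# {0: {"Fact": "...", "Fact_cs": "...", "Reference": "<ref>...</ref>",
#      "Importance": "Important"}}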


def factlist(facts, article_title, language):
    """Transforms the facts into a flat list of dictionaries"""
    
    faclist = []
    
    for i in range(len(facts)):
        chap = facts[i]
        if chap["Facts"]:
            for j in range(len(chap["Facts"])):
                f = chap["Facts"][j]  # A single fact, at last
                f["Chapter_number"] = i
                f["Fact_number"] = j
                f["Heading"] = chap["Heading"]
                f["Chapter_length"] = chap["Length"]
                f["Article_title"] = article_title
                f["Language"] = language
                
                faclist.append(f)
                
    return faclist            
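
# A single flattened fact then looks roughly like this (values illustrative):
# {"Fact": "...", "Fact_cs": "...", "Reference": "", "Importance": "Useful",
#  "Chapter_number": 1, "Fact_number": 0, "Heading": "Život",
#  "Chapter_length": 567, "Article_title": "Jan Čep", "Language": "cs"}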
        

def list_facts(article_title, language):
    """Vypíše fakta a jejich zdroje a kapitoly ze zadaného článku Wikipedie"""
    wikitext = get_wikipedia_source(article_title, language)  # Gets the article
    chapters = split_chapters(wikitext)  # Splits the source by chapters
    facts = reftemplate(chapters, article_title, language)  # Selects the facts  
    vystup = factlist(facts, article_title, language)  # Transform the output to a simpler structure  
    return vystup


def get_article_length(article_name, language='en'):
    """Returns the length of the given article's plain-text extract"""
    url = f"https://{language}.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "format": "json",
        "prop": "extracts",
        "titles": article_name,
        "exlimit": "1",
        "explaintext": "1",
    }

    response = requests.get(url, params=params)
    data = response.json()
    
    page_id = list(data['query']['pages'].keys())[0]
    
    if 'extract' in data['query']['pages'][page_id]:
        article_text = data['query']['pages'][page_id]['extract']
        article_length = len(article_text)
        return article_length
    
    return None
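
# Usage sketch: get_article_length("Jan Čep", "cs") returns the number of
# characters in the plain-text extract, or None when the page has no extract.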


def get_interwiki(article_name, language='en', n=3, include_article_language=False):
    """Creates a list of the n interwikis of a given article that have the most developed articles"""
    url = f"https://{language}.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "format": "json",
        "prop": "langlinks",
        "titles": article_name,
        "lllimit": "500",
    }

    response = requests.get(url, params=params)
    data = response.json()
    interwiki = []
    if include_article_language:
        interwiki = [{'language': language, 'article': article_name,
                      'length': get_article_length(article_name, language) or 0}]
    page_id = list(data['query']['pages'].keys())[0]
        
    if 'langlinks' in data['query']['pages'][page_id]:
        langlinks = data['query']['pages'][page_id]['langlinks']
        for link in langlinks:
            # Treat a missing extract as length 0 so the sort below never compares None to int
            interwiki.append({"language": link['lang'], "article": link['*'],
                              "length": get_article_length(link['*'], link['lang']) or 0})

    return sorted(interwiki, key=lambda x: x['length'], reverse=True)[:n]  # only the n interwikis with the longest articles
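
# Returns e.g. [{'language': 'en', 'article': 'Jan Čep', 'length': 9876}, ...],
# sorted by article length, longest first (the values here are illustrative).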


def fakta(article_title, language="cs", n=3):
    """Pro daný článek vypíše fakta z n nejrozsáhlejších interwiki"""
    interwiki = get_interwiki(article_title, language, n)
    fakta = []
    for iw in interwiki:
        fakta += list_facts(iw['article'], iw['language'])
    return fakta
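
# fakta("Jan Čep", "cs", 3) thus yields one flat list of fact dictionaries
# collected from the three longest interwiki versions of the article.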


def dictlist_to_wikitable(dict_list, unsourced=True, drop_columns=None):
    """
    Converts a list of dictionaries to a Wikimedia wiki table markup.
    
    Args:
        dict_list (list): A list of dictionaries, all with the same structure.
        unsourced: Whether to keep unsourced facts (column Reference is empty)
        drop_columns: list of columns to drop from the output table
        
    Returns:
        str: The Wikimedia wiki table markup.
    """
    drop_columns = drop_columns or []  # avoid a mutable default argument
    # Keep only the keys of the first dictionary that are not dropped
    keys = [key for key in dict_list[0].keys() if key not in drop_columns]
    
    # Start the table markup
    table = "{| class=\"wikitable sortable\"\n"
    
    # Add the header row
    table += "! " + " !! ".join(keys) + "\n"
    
    # Add the data rows
    for d in dict_list:
        row = "|-\n| "
        for k in keys:
            value = str(d[k])
            if k.lower() == "reference" and len(value) > 0:
                value = "<nowiki>{}</nowiki>".format(value)
            row += value + " || "
        if unsourced or len(d["Reference"]) > 0:       
            table += row[:-4] + "\n"  # Remove the last " || "
    
    # Close the table markup
    table += "|}"
    
    return table
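
# Note: values in the Reference column are wrapped in <nowiki> so that <ref>
# tags and citation templates display literally inside the rendered table.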


f = fakta(article_title, language, n)  

# Print out the facts as a wikitable 
drops=["Fact", "Chapter_number", "Fact_number", "Chapter_length", "Article_title"]  # columns to drop
if  output_all_columns:  # output_all_columns = zda se mají vypsat všechny sloupce 
    drops=[]
print(dictlist_to_wikitable(f, unsourced=output_unsourced_facts, drop_columns=drops))