Wikipedie:GPT vypisovač informace, která není obsažena v článku, ale je v interwiki

The following Python 3 program is based on artificial intelligence. You supply a short Czech Wikipedia article that has interwiki links (write its name directly into the program; it is a "bare" script with no user interface). The program prints, point by point, up to twelve of the most important pieces of information it found in the interwiki articles and that could enrich the article. It requires access to OpenAI's paid API, which has to be purchased on their website.
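The script has three third-party dependencies (requests, BeautifulSoup, and the OpenAI client). Assuming a standard Python 3 environment, they can be installed with pip:

pip install requests beautifulsoup4 openai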

import requests
import re
import time
from bs4 import BeautifulSoup
from openai import OpenAI
my_api_key = "sk-..."  # replace with your own OpenAI API key - it can be purchased on their website
client = OpenAI(api_key=my_api_key)
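# A minimal alternative sketch (not part of the original script): keep the key out of the
# source by reading it from an environment variable instead of hard-coding it:
#   import os
#   client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])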

# Inputs:
article_name = "Dřevina"  # name of the article, in quotes
language = "cs"  # language edition of Wikipedia
n = 3    # how many interwiki articles to compare it against

def get_article_length(article_name, language='en'):
    """Return the plain-text length of a given article, or 0 if it has no extract"""
    url = f"https://{language}.wikipedia.org/w/api.php?action=query&format=json&prop=extracts&titles={article_name}&exlimit=1&explaintext"

    response = requests.get(url)
    data = response.json()

    page_id = list(data['query']['pages'].keys())[0]

    if 'extract' in data['query']['pages'][page_id]:
        article_text = data['query']['pages'][page_id]['extract']
        return len(article_text)

    return 0  # 0 rather than None, so the length sort in get_interwiki never compares None with int
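
# A quick sanity check of the function above (hypothetical call; requires network access):
#   get_article_length("Dřevina", "cs")   # -> character count of the plain-text extract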


def get_interwiki(article_name, language='en', n=3, include_article_language=False):
    """Return the n interwiki versions of a given article that have the longest (most developed) texts"""
    url = f"https://{language}.wikipedia.org/w/api.php?action=query&prop=langlinks&format=json&titles={article_name}&lllimit=500"

    response = requests.get(url)
    data = response.json()
    interwiki = []
    if include_article_language:
        interwiki = [{'language': language, 'article': article_name,
                      'length': get_article_length(article_name, language)}]
    page_id = list(data['query']['pages'].keys())[0]

    if 'langlinks' in data['query']['pages'][page_id]:
        langlinks = data['query']['pages'][page_id]['langlinks']
        for link in langlinks:
            interwiki.append({"language": link['lang'], "article": link['*'],
                              "length": get_article_length(link['*'], link['lang'])})

    return sorted(interwiki, key=lambda x: x['length'], reverse=True)[:n]  # only the n interwikis with the longest articles
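
# Example of the returned shape (a sketch; the actual languages, titles, and lengths
# depend on the live wiki data):
#   get_interwiki("Dřevina", "cs", n=2)
#   # -> [{'language': 'en', 'article': '...', 'length': ...},
#   #     {'language': 'de', 'article': '...', 'length': ...}]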


def wiki_plain_text(article_name, language='en'):
    """Return the texts of the article's chapters as a list of strings"""
    url = f"https://{language}.wikipedia.org/wiki/{article_name}"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    for unwanted in soup(["table", "figure", "sup"]):  # remove unwanted tag elements
        unwanted.extract()

    paras = []
    article_text = ''
    pridej = True  # pridej ("append" in Czech): should the current chapter be added to the output?
    for header in soup.find(id='mw-content-text').find_all(["h2", "h3", "h4", "p", "ul", "ol", "li"]):
        headertext = re.sub(r'\[.*?\]', '', header.text.strip())  # strip reference markers such as [1]
        headertext = headertext.replace('\xa0', ' ')
        delka = len(headertext)  # delka = length of the element's text
        if header.name == "p":
            article_text += headertext.replace('\n', ' ') + "\n"
            if delka > 50:
                pridej = True  # the chapter contains at least one substantial paragraph
        else:
            if pridej:
                paras.append(article_text)
            article_text = headertext + '\n'  # a heading or list element starts a new chapter
            pridej = False
    if pridej:
        paras.append(article_text)

    return [item for item in paras if item.strip() != ""]  # only items which are not empty
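
# Note: each returned item is one chapter, i.e. a heading followed by its paragraph texts.
# A chapter is kept only if at least one of its paragraphs exceeds 50 characters, which
# filters out stub sections and bare navigation lists.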


def raw_info(article_name, language='en', n=3):
    """Collect the chapter texts of the n most developed articles: the basis for reasoning about the subject"""
    interwiki = get_interwiki(article_name, language, n)
    print("*** Source languages: ")
    for i in interwiki:
        print('  ' + i['language'])
    orig = wiki_plain_text(article_name, language)

    info = [orig[0]]  # start with the basic definition in the original language

    for iw in interwiki:  # add the chapters of the selected interwikis
        info += wiki_plain_text(iw['article'], iw['language'])
    return info
        

def info_bits(article_name, language='en', n=3):
    """Create bits of information about the subject and evaluate their importance"""
    info = raw_info(article_name, language, n)
    print("*** Chapters read: " + str(len(info) - 1))

    defined = info.pop(0)

    prompt = f"We will create a list of facts about {article_name}, which Wikipedia describes as: {defined} "
    prompt += f"Take the following text and extract facts about {article_name} from it. "
    prompt += "The output will have one fact per line. There will be a short statement of the fact, "
    prompt += f"a semicolon, and an evaluation of the importance of the fact for understanding {article_name} "
    prompt += """on the scale 'Very important', 'Important', 'Less important', 'Unimportant'. 
              Example of one line of the output: \nThe king had no daughters. ; Unimportant \n """
    prompt += f"Write in the language of the original text. The text about {article_name} is: "

    facts = ''

    for chapter in info:   # process the chapters one by one

        response = client.chat.completions.create(
            model="gpt-3.5-turbo-16k",    # "gpt-3.5-turbo-16k" or "gpt-4-turbo"
            max_tokens=1200,
            messages=[
                {
                  "role": "user",
                  "content": prompt + chapter
                },]
              )
        msgtext = response.choices[0].message.content

        if msgtext is not None:
            facts += msgtext
            facts += "\n"
        else:
            print("**No message Prompt** " + prompt + chapter)
            print("**No message Response** " + str(response))
        #time.sleep(1)   # uncomment to throttle requests if you hit API rate limits

    return facts
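
# The returned string should contain one fact per line in the form "<fact> ; <importance>",
# as requested by the prompt; the model's adherence to the format is not guaranteed, so
# final_list below treats the result as free text rather than parsing it.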

    
def final_list(article_name, language='en', n=3):
    """Compare a Wikipedia article with the information extracted from the other interwiki articles"""

    info = info_bits(article_name, language, n)   # info from the most developed articles

    plaintext = ' '.join(wiki_plain_text(article_name, language))     # the analyzed article

    prompt = f"The goal is to prepare a list of important facts which are not mentioned in an article \
             about {article_name} and can enhance it. \
             Therefore write the output in the language of the article, which is {language}. \
             Start by comparing the article about {article_name} with a list of facts about the same subject. \
             The facts are followed by an importance estimation, but this estimation may be wrong. \
             The facts may be written in different languages, \
             but you will use {language} in the output. Here is **the article**:\n"
    prompt += plaintext
    prompt += "\nAnd here is **the list of facts**:\n"
    prompt += info
    prompt += f"\nSome of the facts may repeat in the list. Create a new numbered list of facts about \
             {article_name}, this time without repetition. Each fact will be on a separate line. \
             Drop the importance estimation. Mention only those facts which are substantial and not already mentioned \
             in the article. Mention at most the twelve most important facts. Sort the facts in order of importance. \
             Write in the language of the article, which is {language}. Drop all the facts which are contained \
             in the article you have read first."

    response = client.chat.completions.create(
        model="gpt-4o",    # a stronger model is used for the final comparison
        max_tokens=2200,
        messages=[
                {
                  "role": "user",
                  "content": prompt
                },]
              )
    msgtext = response.choices[0].message.content
    return [msgtext, info, plaintext]
    
    
info = final_list(article_name, language, n)
print(info[0])  # print the final numbered list of facts
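
# For debugging, the intermediate results returned by final_list can be printed as well:
# print(info[1])   # the raw fact list with importance labels
# print(info[2])   # the plain text of the analyzed article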