Bikarhêner:Balyozxane/becure.py

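# Overview of what this script does: it reads entry titles from list_bêcure.txt, keeps
# only those ku.wiktionary pages whose entire content is a bare "== {{ziman|xx}} =="
# header followed by a "=== Mane ===" section with a single definition line, looks up
# the entry's part-of-speech category on en.wiktionary, and rewrites the page so the
# "Mane" heading is replaced by the matching Kurdish part-of-speech heading and template
# (adding a gender parameter for German nouns). Pages that cannot be handled safely are
# logged to skipped_pages_en.txt.
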
import pywikibot
import re

# Function to get page titles from a file
def get_page_titles_from_file(filename):
    with open(filename, 'r', encoding='utf-8') as file:
        return [line.strip() for line in file]
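
# Input format: list_bêcure.txt is expected to hold one ku.wiktionary page title per
# line; lines are stripped, so stray whitespace is harmless, but blank lines would
# come through as empty titles.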
 
# Function to determine if a page should be skipped
def should_skip_page(content):
    # Pattern for the expected minimal page structure: a "{{ziman|xx}}" header,
    # a "=== Mane ===" section and a single definition line
    pattern = r'==[ ]*?\{\{ziman\|(tr|en|ar|fa|de)\}\}[ ]*?==\n?\n?===[ ]*?Mane[ ]*?===\n?\n?#.*\n?\n?'

    match = re.search(pattern, content)

    # Accept only when the matched block is the whole page (strip both sides so a
    # trailing newline in the match does not cause a false mismatch)
    if match and content.strip() == match.group(0).strip():
        return match.group(1), False
    else:
        return None, True
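
# Illustrative example of a page body that should_skip_page accepts (the headword entry
# and definition text are hypothetical):
#
#   == {{ziman|de}} ==
#
#   === Mane ===
#
#   # a single definition line
#
# For such a page it returns ("de", False); for anything else it returns (None, True).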

# Function to check if a page belongs to a single category
def is_page_in_single_category(page_title, categories_to_check):
    site = pywikibot.Site("en", "wiktionary")
    page = pywikibot.Page(site, page_title)

    page_categories = [cat.title(with_ns=False) for cat in page.categories()]
    matching_categories = []

    for category in categories_to_check:
        if category in page_categories:
            matching_categories.append(category)

    print(f"matching_categories: {matching_categories}")
    if len(matching_categories) == 1:
        return matching_categories[0]
    else:
        return None
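
# Example behaviour (hypothetical input): if the en.wiktionary page sits in exactly one
# of the categories passed in, that category title is returned; if it sits in none of
# them, or in several (e.g. both "German nouns" and "German proper nouns"), None is
# returned and the callers fall back to skipping the page or omitting the gender.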

# Function to extract category type and language
def extract_cat_type_and_lang(cleared_category):
    match = re.match(r"^(English|Turkish|German|Arabic|Persian) (.+)$", cleared_category)
    if match:
        return match.group(1), match.group(2)
    return None, None
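
# Example: extract_cat_type_and_lang("German feminine nouns") returns
# ("German", "feminine nouns"); a category that does not start with one of the five
# language names yields (None, None).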
    
def get_gender(page_title):

    # Define category types
    gender_types = {
        "feminine nouns": "m",
        "masculine nouns": "n",
        "neuter nouns": "nt",
        "nouns with multiple genders": "mn"
    }
    categories_to_check = [f"German {category}" for category in gender_types.keys()]
    print(f"gender_cats {categories_to_check}")
    
    cleared_category = is_page_in_single_category(page_title, categories_to_check)
    if cleared_category:
        lang, gender = extract_cat_type_and_lang(cleared_category)
        ku_gender = gender_types[gender]
        print(f"ku_gender {ku_gender}")
        return ku_gender
    return None    
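
# Example: for a title that en.wiktionary lists only under "German feminine nouns",
# get_gender returns "m"; if the page sits in none or in several of the gender
# categories, None is returned and no |z= parameter will be added.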
    
# Function to update page content
def update_page_content(page_title, cat_type, lang_code):
    site = pywikibot.Site("ku", "wiktionary")
    page = pywikibot.Page(site, page_title)
    page_text = page.text
    
    if cat_type == "Gotinên pêşiyan":
        cat_type = "Gotineke pêşiyan"

    # Lenient version of the header pattern from should_skip_page (nearly every
    # character is optional), so variants of the "{{ziman|xx}}" header are also replaced
    pattern = r'=?=?\s?\{?\{?z?i?m?a?n?\|?(tr|en|ar|fa|de)?\}?\}?\s*?=?=?\n?\n?===[ ]*?Mane[ ]*?===\n?\n?#'
    # Define the replacement text
    replacement = "== {{ziman|" + lang_code + "}} ==\n\n=== " + cat_type + " ===\n{{" + cat_type.lower() + "|" + lang_code

    if cat_type == "Navdêr" and lang_code == "de":
        gender = get_gender(page_title)
        if gender:
            replacement += "|z=" + gender
         
    replacement += "}}\n#"
    # Use re.sub to perform the replacement
    new_content = re.sub(pattern, replacement, page_text)

    page.text = new_content
    page.save("+Şablona cureyê rêzimanî (bi riya [[Bikarhêner:Balyozxane/becure.py|bêcure.py]])")
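
# Illustrative result for a German feminine noun with cat_type "Navdêr" (the definition
# line "# wate" is hypothetical): the block
#   == {{ziman|de}} ==  /  === Mane ===  /  # wate
# is rewritten to
#   == {{ziman|de}} ==  /  === Navdêr ===  /  {{navdêr|de|z=m}}  /  # wate
# (newlines shown as "/"; blank lines omitted).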

# Function to log skipped pages
def log_skipped_page(page_title):
    with open('skipped_pages_en.txt', 'a', encoding='utf-8') as file:
        file.write(page_title + '\n')

# Define language names and their corresponding codes
languages = {
    "Turkish": "tr",
    "English": "en",
    "Arabic": "ar",
    "Persian": "fa",
    "German": "de"
}

# Define category types
category_types = {
    "verbs": "Lêker",
    "nouns": "Navdêr",
    "adjectives": "Rengdêr",
    "proper nouns": "Serenav",
    "adverbs": "Hoker",
    "pronouns": "Cînav",
    "interjections": "Baneşan",
    "circumpositions": "Bazinedaçek",
    "conjunctions": "Girêdek",
    "proverbs": "Gotinên pêşiyan",
    "numerals": "Hejmar",
    "interfixes": "Navgir",
    "postpositions": "Paşdaçek",
    "suffixes": "Paşgir",
    "prepositions": "Pêşdaçek",
    "prefixes": "Pêşgir",
    "particles": "Pirtik",
    "roots": "Reh",
    "symbols": "Sembol",
    "letters": "Tîp"
}
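
# The en.wiktionary categories checked below are built as "<language name> <category type>",
# e.g. "English nouns" or "German verbs"; a single match yields the language code ("en",
# "de", ...) and the Kurdish heading ("Navdêr", "Lêker", ...) used for the new template.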

# Step 1: Get list of page titles from file
pages = get_page_titles_from_file('list_bêcure.txt')

# Steps 2 and 3: check each page and either update it or log it as skipped
for page_title in pages:
    content = pywikibot.Page(pywikibot.Site('ku', 'wiktionary'), page_title).text

    ku_lang_code, should_skip = should_skip_page(content)
    
    if should_skip:
        print(f"Skipping page {page_title} as it contains unexpected content")
        log_skipped_page(page_title)
    else:
        print(f"{page_title} not skipped. Matched ku_lang_code: {ku_lang_code}")
        # Create categories to check
        categories_to_check = [f"{lang_name} {category}" for lang_name in languages for category in category_types]

        cleared_category = is_page_in_single_category(page_title, categories_to_check)

        if cleared_category:
            lang_name, cat_type = extract_cat_type_and_lang(cleared_category)

            pywikibot.output(f"cat_type '{cat_type}', lang_name '{lang_name}'.")

            lang_code = languages[lang_name]
            ku_cat_type = category_types[cat_type]
            pywikibot.output(f"lang_code '{lang_code}'; ku_cat_type '{ku_cat_type}'.")
            
            if lang_code == ku_lang_code:
                update_page_content(page_title, ku_cat_type, lang_code)
                print(f"Updated page content for {page_title}")
            else:
                print(f"{page_title} lang_code is not equal to ku_lang_code skipping the page. ")
                log_skipped_page(page_title)       
        else: 
            print(f"{page_title} is not in the specified categories or matches multiple categories")
            log_skipped_page(page_title)

print("Finished processing all pages.")