Skip to content

Extraction of numbering and multi-level sign #1472

Open
@lottopotato

Description

@lottopotato

Hello everyone.

I need to extract numbering and multilevel lists for my work, so I wrote the following extraction code:

def get_qn_name(tag_name):
    """Return the result of passing *tag_name* (e.g. 'w:val') to docx.oxml.ns.qn."""
    qualified_name = docx.oxml.ns.qn(tag_name)
    return qualified_name

def get_numbering_part(doc):
    """Collect the level definitions of every numbering instance in *doc*.

    Reads ``doc.part.numbering_part.element`` and resolves each ``w:num``
    element to the ``w:abstractNum`` it references.

    Args:
        doc: object exposing ``part.numbering_part.element``; the element
            must support ``findall``/``find``/``get``.

    Returns:
        dict mapping each numId (str) to ``{ilvl (str): [lvlText, numFmt]}``.
        Levels that lack an ``ilvl`` attribute, a ``w:lvlText`` child or a
        ``w:numFmt`` child are left out. ``w:num`` elements whose abstract
        definition cannot be found are left out as well.
    """
    numbering_root = doc.part.numbering_part.element

    w_val = get_qn_name('w:val')
    w_num = get_qn_name('w:num')
    w_numId = get_qn_name('w:numId')
    w_abstractNum = get_qn_name('w:abstractNum')
    w_abstractNumId = get_qn_name('w:abstractNumId')
    w_lvl = get_qn_name('w:lvl')
    w_ilvl = get_qn_name('w:ilvl')
    w_lvlText = get_qn_name('w:lvlText')
    w_numFmt = get_qn_name('w:numFmt')

    # 1. Index every abstractNum definition by its abstractNumId attribute.
    levels_by_abstract_id = {}
    for abstract_num in numbering_root.findall(w_abstractNum):
        abstract_id = abstract_num.get(w_abstractNumId)
        if abstract_id is None:
            continue
        levels = {}
        for lvl in abstract_num.findall(w_lvl):
            ilvl = lvl.get(w_ilvl)
            lvl_text = lvl.find(w_lvlText)
            num_fmt = lvl.find(w_numFmt)
            # Compare with None explicitly: element truthiness depends on
            # child count, not on whether the element was found.
            if ilvl is None or lvl_text is None or num_fmt is None:
                continue
            levels[ilvl] = [lvl_text.get(w_val), num_fmt.get(w_val)]
        levels_by_abstract_id[abstract_id] = levels

    # 2. Resolve each w:num to its abstract definition. The result is keyed
    #    by numId because several w:num elements may point at the same
    #    abstractNum; an abstractNumId -> numId map would keep only the last
    #    of them and lose the others.
    numbering_part = {}
    for num in numbering_root.findall(w_num):
        num_id = num.get(w_numId)
        abstract_ref = num.find(w_abstractNumId)
        if num_id is None or abstract_ref is None:
            continue
        levels = levels_by_abstract_id.get(abstract_ref.get(w_val))
        if levels is None:
            continue
        # Copy so that numIds sharing a definition do not share mutable state.
        numbering_part[num_id] = {
            ilvl: list(entry) for ilvl, entry in levels.items()
        }
    return numbering_part

def get_known_formats():
    """Build lookup tables for the non-decimal numbering formats.

    Returns:
        dict with the keys 'upperLetter', 'lowerLetter', 'upperRoman' and
        'lowerRoman'. Each value is a list whose index 0 holds the label of
        the first list item.
    """
    def generate_roman_numerals(limit=100, uppercase=True):
        """Return the Roman numerals for 1..limit as a list of strings."""
        # Ordered from largest to smallest so a single greedy pass suffices.
        # Symbols above 100 are included so a larger limit stays correct.
        roman_map = (
            (1000, 'M'), (900, 'CM'), (500, 'D'), (400, 'CD'),
            (100, 'C'), (90, 'XC'), (50, 'L'), (40, 'XL'),
            (10, 'X'), (9, 'IX'), (5, 'V'), (4, 'IV'), (1, 'I'),
        )
        roman_numerals = []
        for number in range(1, limit + 1):
            remaining = number
            parts = []
            for value, numeral in roman_map:
                count, remaining = divmod(remaining, value)
                parts.append(numeral * count)
            result = ''.join(parts)
            roman_numerals.append(result if uppercase else result.lower())
        return roman_numerals

    ENG_LETTERS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    # The OOXML letter formats repeat one letter once the alphabet is
    # exhausted: ..., Y, Z, AA, BB, CC, ... (not AA, AB, AC).
    # 27 rounds keep the table at 26 * 27 = 702 entries, the size of the
    # previous single + double letter table.
    upperLetter = [
        letter * repeat
        for repeat in range(1, 28)
        for letter in ENG_LETTERS
    ]
    lowerLetter = [label.lower() for label in upperLetter]
    upperRoman = generate_roman_numerals()
    lowerRoman = generate_roman_numerals(uppercase=False)
    return {
        'upperLetter': upperLetter,
        'lowerLetter': lowerLetter,
        'upperRoman': upperRoman,
        'lowerRoman': lowerRoman
    }

def get_string_for_format(format, stack_number, known_formats):
    """Return the label for the zero-based counter *stack_number*.

    When *format* has a table in *known_formats* and the table is long
    enough, the table entry at *stack_number* is returned. Otherwise the
    one-based number ``stack_number + 1`` is returned as an int.
    """
    fallback = stack_number + 1
    if format not in known_formats:
        return fallback
    labels = known_formats[format]
    if stack_number < len(labels):
        return labels[stack_number]
    return fallback

def apply_numbering(numId_val, ilvl_val, numbering_part, numbering_part_stack, known_formats):
    """Advance the counters of one list and render the label of its next item.

    Args:
        numId_val: numId of the paragraph, a key of *numbering_part*.
        ilvl_val: level of the paragraph as found in the XML (a string such
            as '0'); ``None`` is treated as level 0.
            NOTE(review): confirm that a missing w:ilvl should mean level 0.
        numbering_part: mapping ``numId -> {ilvl: [lvlText, numFmt]}`` as
            built by ``get_numbering_part``.
        numbering_part_stack: mapping ``numId -> {level (int): counter}``
            holding zero-based counters; updated in place.
        known_formats: label tables passed on to ``get_string_for_format``.

    Returns:
        Always a tuple ``(prefix, numbering_part_stack)``. *prefix* is the
        level text with every ``%N`` placeholder replaced. It is an empty
        string when the numId or the level has no definition in
        *numbering_part*; in that case the counters are left untouched.
    """
    if ilvl_val is None:
        ilvl_val = '0'
    level_key = str(ilvl_val)

    numbering = numbering_part.get(numId_val)
    if numbering is None or level_key not in numbering:
        # Nothing is defined for this paragraph, so there is no label to render.
        return '', numbering_part_stack

    counters = numbering_part_stack.setdefault(numId_val, {})
    level = int(ilvl_val)
    level_text, _ = numbering[level_key]

    # Any item restarts the levels nested below it, including the first
    # time its own level is seen.
    for deeper in [known for known in counters if known > level]:
        del counters[deeper]
    counters[level] = counters[level] + 1 if level in counters else 0

    def render(match):
        # %N refers to level N-1, whatever level the paragraph itself is on.
        ref_level = int(match.group(1)) - 1
        ref_format = numbering.get(str(ref_level), (None, 'decimal'))[1]
        # A level that has not been counted yet is shown as its first item.
        stack_number = counters.get(ref_level, 0)
        return str(get_string_for_format(ref_format, stack_number, known_formats))

    # Substituting directly, rather than via str.format, keeps literal
    # braces in the level text intact.
    prefix = re.sub(r'%(\d+)', render, level_text or '')
    return prefix, numbering_part_stack

def get_ppr_val(para):
    """Read the numbering reference of a paragraph element.

    Returns a ``(numId, ilvl)`` tuple holding the ``w:val`` attribute of the
    paragraph's ``numPr.numId`` and ``numPr.ilvl`` children. Each entry is
    ``None`` when the corresponding element is absent. ``(None, None)`` is
    returned when *para* is not a ``CT_P`` instance or has no ``pPr``/``numPr``.
    """
    if not isinstance(para, docx.oxml.text.paragraph.CT_P):
        return None, None

    w_val = docx.oxml.ns.qn('w:val')

    def value_of(child):
        # Absent child elements simply yield None.
        return None if child is None else child.get(w_val)

    properties = para.pPr
    if properties is None:
        return None, None
    num_properties = properties.numPr
    if num_properties is None:
        return None, None

    num_id = value_of(num_properties.numId)
    level = value_of(num_properties.ilvl)
    return num_id, level

Image
Image

# Walk the document body in order and print every element that has text,
# prefixed with its list label when the element carries a numbering reference.
numbering_part = get_numbering_part(doc)
known_formats = get_known_formats()
numbering_part_stack = {}
for i, element in enumerate(doc.element.body):
    text = element.text
    numId_val, ilvl_val = get_ppr_val(element)
    if text is None:
        continue

    print(i)
    print('-' * 50)
    if numId_val is None:
        # Plain element: no numbering reference.
        print(text)
    else:
        prefix, numbering_part_stack = apply_numbering(
            numId_val,
            ilvl_val,
            numbering_part,
            numbering_part_stack,
            known_formats,
        )
        print(f'{prefix} {text}')
    print('-' * 50)
    print()
0
--------------------------------------------------
Numbering
--------------------------------------------------

1
--------------------------------------------------
1. 1
--------------------------------------------------

2
--------------------------------------------------
A. 1-A
--------------------------------------------------

3
--------------------------------------------------
i. 1-A-Roman 1
--------------------------------------------------

4
--------------------------------------------------
ii. 1-A-Roman 2
--------------------------------------------------

5
--------------------------------------------------
B. 1-B
--------------------------------------------------

6
--------------------------------------------------
2. 2
--------------------------------------------------

7
--------------------------------------------------
3. 3
--------------------------------------------------

8
--------------------------------------------------
4. 4
--------------------------------------------------

9
--------------------------------------------------
Step1. step1
--------------------------------------------------

10
--------------------------------------------------
Step2. step2
--------------------------------------------------

11
--------------------------------------------------
Step3. step3
--------------------------------------------------

12
--------------------------------------------------
1 Circle 1
--------------------------------------------------

13
--------------------------------------------------
A. Circle 1-A
--------------------------------------------------

14
--------------------------------------------------
B. Circle 1-B
--------------------------------------------------

15
--------------------------------------------------
i. Circle 1-B-Roman 1
--------------------------------------------------

16
--------------------------------------------------
ii. Circle 1-B-Roman 2
--------------------------------------------------

17
--------------------------------------------------
C. Circle 1-C
--------------------------------------------------

18
--------------------------------------------------
2 Circle 2
--------------------------------------------------

19
--------------------------------------------------
3 Circle 3
--------------------------------------------------

20
--------------------------------------------------
Multi-Level
--------------------------------------------------

21
--------------------------------------------------
1. 1
--------------------------------------------------

22
--------------------------------------------------
1.1. 1-1
--------------------------------------------------

23
--------------------------------------------------
1.1.1. 1-1-1
--------------------------------------------------

24
--------------------------------------------------
1.1.2. 1-1--2
--------------------------------------------------

25
--------------------------------------------------
1.2. 1-2
--------------------------------------------------

26
--------------------------------------------------
1.2.1. 1-2-1
--------------------------------------------------

27
--------------------------------------------------
1.3. 1-3
--------------------------------------------------

28
--------------------------------------------------
2. 2
--------------------------------------------------

29
--------------------------------------------------
3. 3
--------------------------------------------------

If you have any alternative approaches or improvements (such as supporting more formats in "known_formats"),
please share them with us. Thank you.

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions