Open
Description
Hello everyone.
I need to extract numbering and multilevel lists for my work, so I wrote the following extraction code:
def get_qn_name(tag_name):
    """Return the qualified name for a prefixed tag such as ``'w:val'``.

    Thin convenience wrapper that delegates to ``docx.oxml.ns.qn`` so the
    rest of the script does not have to spell out the full module path.
    """
    qualified_name = docx.oxml.ns.qn(tag_name)
    return qualified_name
def get_numbering_part(doc):
    """Collect the list-level definitions of a document, keyed by numId.

    The numbering part stores the actual level definitions on
    ``w:abstractNum`` elements, while paragraphs refer to ``w:num`` elements
    (by ``w:numId``), each of which points at one abstract definition.

    Several ``w:num`` elements may point at the same ``w:abstractNum``, so
    every ``w:num`` is resolved individually; keying the intermediate map by
    abstractNumId would keep only one numId per abstract definition.

    Args:
        doc: Document object exposing ``doc.part.numbering_part.element``.

    Returns:
        dict: ``{numId: {ilvl: [lvlText, numFmt]}}`` where every key and
        value is the raw attribute string read from the XML. Levels that
        lack ``w:ilvl``, ``w:lvlText`` or ``w:numFmt`` are omitted.
    """
    numbering_root = doc.part.numbering_part.element

    w_val = get_qn_name('w:val')
    w_num = get_qn_name('w:num')
    w_numId = get_qn_name('w:numId')
    w_abstractNum = get_qn_name('w:abstractNum')
    w_abstractNumId = get_qn_name('w:abstractNumId')
    w_lvl = get_qn_name('w:lvl')
    w_ilvl = get_qn_name('w:ilvl')
    w_lvlText = get_qn_name('w:lvlText')
    w_numFmt = get_qn_name('w:numFmt')

    # 1. Read every abstract definition: abstractNumId -> {ilvl: [text, fmt]}.
    levels_by_abstract_id = {}
    for abstract_num in numbering_root.findall(w_abstractNum):
        abstract_id = abstract_num.get(w_abstractNumId)
        if abstract_id is None:
            continue
        levels = {}
        for lvl in abstract_num.findall(w_lvl):
            ilvl = lvl.get(w_ilvl)
            lvl_text = lvl.find(w_lvlText)
            num_fmt = lvl.find(w_numFmt)
            # Compare with None explicitly: an XML element without children
            # can be falsy even though it was found.
            if ilvl is None or lvl_text is None or num_fmt is None:
                continue
            levels[ilvl] = [lvl_text.get(w_val), num_fmt.get(w_val)]
        levels_by_abstract_id[abstract_id] = levels

    # 2. Resolve each w:num (the id paragraphs use) to its abstract definition.
    numbering_part = {}
    for num in numbering_root.findall(w_num):
        num_id = num.get(w_numId)
        abstract_ref = num.find(w_abstractNumId)
        if num_id is None or abstract_ref is None:
            continue
        levels = levels_by_abstract_id.get(abstract_ref.get(w_val))
        if levels is None:
            continue
        # Give every numId its own copy so callers can modify one entry
        # without affecting other numIds that share the abstract definition.
        numbering_part[num_id] = {
            ilvl: list(pair) for ilvl, pair in levels.items()
        }
    return numbering_part
def get_known_formats():
    """Build lookup tables for the non-decimal number formats.

    Each table is a list indexed by a zero-based counter: index 0 is the
    first item of a list ('A', 'a', 'I', 'i').

    Letter formats follow the WordprocessingML rule: after 'Z' the sequence
    continues with the same letter repeated ('AA', 'BB', ... 'ZZ', 'AAA',
    ...), not with spreadsheet-style 'AA', 'AB', 'AC'.

    Returns:
        dict: keys 'upperLetter', 'lowerLetter', 'upperRoman', 'lowerRoman',
        each mapping to a list of strings.
    """
    def generate_roman_numerals(limit=100, uppercase=True):
        # Full table (largest first) so the helper stays correct if the
        # limit is ever raised above 100.
        roman_map = (
            (1000, 'M'), (900, 'CM'), (500, 'D'), (400, 'CD'),
            (100, 'C'), (90, 'XC'), (50, 'L'), (40, 'XL'),
            (10, 'X'), (9, 'IX'), (5, 'V'), (4, 'IV'), (1, 'I'),
        )
        roman_numerals = []
        for number in range(1, limit + 1):
            remaining = number
            parts = []
            for value, numeral in roman_map:
                count, remaining = divmod(remaining, value)
                parts.append(numeral * count)
            text = ''.join(parts)
            roman_numerals.append(text if uppercase else text.lower())
        return roman_numerals

    def generate_letters(alphabet, limit):
        # Item i uses letter (i mod 26), repeated once more for every full
        # pass through the alphabet: A..Z, AA..ZZ, AAA..ZZZ, ...
        size = len(alphabet)
        return [alphabet[i % size] * (i // size + 1) for i in range(limit)]

    ENG_LETTERS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    # Same table size as before (26 single letters + 26 * 26 further items).
    letter_limit = len(ENG_LETTERS) + len(ENG_LETTERS) ** 2

    return {
        'upperLetter': generate_letters(ENG_LETTERS, letter_limit),
        'lowerLetter': generate_letters(ENG_LETTERS.lower(), letter_limit),
        'upperRoman': generate_roman_numerals(),
        'lowerRoman': generate_roman_numerals(uppercase=False),
    }
def get_string_for_format(format, stack_number, known_formats):
    """Look up the label for a zero-based counter in a number format.

    Args:
        format: Number-format name used as a key into ``known_formats``.
        stack_number: Zero-based counter value of the list level.
        known_formats: Mapping of format name to a sequence of labels.

    Returns:
        The label at index ``stack_number`` when the format is known and the
        index is below the table length; otherwise the one-based integer
        ``stack_number + 1`` as a plain decimal fallback.
    """
    decimal_fallback = stack_number + 1
    if format not in known_formats:
        return decimal_fallback
    labels = known_formats[format]
    if stack_number < len(labels):
        return labels[stack_number]
    return decimal_fallback
def apply_numbering(numId_val, ilvl_val, numbering_part, numbering_part_stack, known_formats):
    """Advance the counters for one list paragraph and render its prefix.

    Args:
        numId_val: ``w:numId`` value of the paragraph (string key into
            ``numbering_part``).
        ilvl_val: ``w:ilvl`` value of the paragraph as a string (or int);
            ``None`` is treated as the top level ``'0'``.
        numbering_part: ``{numId: {ilvl: [lvlText, numFmt]}}`` as produced by
            ``get_numbering_part``.
        numbering_part_stack: ``{numId: {level(int): zero-based counter}}``;
            updated in place and also returned.
        known_formats: Format tables passed on to ``get_string_for_format``.

    Returns:
        tuple: ``(prefix, numbering_part_stack)``. A tuple is returned on
        every path, including level texts without ``%N`` placeholders
        (e.g. bullets), so callers can always unpack two values.

    Raises:
        KeyError: if ``numId_val`` or the level is not in ``numbering_part``.
    """
    levels = numbering_part[numId_val]
    if ilvl_val is None:
        ilvl_val = '0'
    level_text, _ = levels[str(ilvl_val)]
    current_level = int(ilvl_val)

    counters = numbering_part_stack.setdefault(numId_val, {})

    # Any deeper level restarts once a shallower item appears, whether or not
    # this level has been seen before.
    for deeper_level in [lvl for lvl in counters if lvl > current_level]:
        del counters[deeper_level]

    # First item on a level gets counter 0; later items count up.
    if current_level in counters:
        counters[current_level] += 1
    else:
        counters[current_level] = 0

    def render_placeholder(match):
        # '%N' refers to level N - 1, independent of where it appears.
        level = int(match.group(1)) - 1
        level_definition = levels.get(str(level))
        number_format = level_definition[1] if level_definition else 'decimal'
        # An ancestor level that never appeared counts as its first item.
        counter = counters.get(level, 0)
        return str(get_string_for_format(number_format, counter, known_formats))

    # Substituting via a callback keeps literal '{' / '}' in the level text
    # intact, which str.format-based rendering would choke on.
    prefix = re.sub(r'%(\d+)', render_placeholder, level_text or '')
    return prefix, numbering_part_stack
def get_ppr_val(para):
    """Read the numbering reference of a paragraph element.

    Args:
        para: A body child element; anything that is not a
            ``docx.oxml.text.paragraph.CT_P`` yields ``(None, None)``.

    Returns:
        tuple: ``(numId_val, ilvl_val)`` holding the ``w:val`` attribute of
        the paragraph's ``numPr/numId`` and ``numPr/ilvl`` children. Each
        entry is ``None`` when the corresponding element is absent.
    """
    if not isinstance(para, docx.oxml.text.paragraph.CT_P):
        return None, None

    val_name = docx.oxml.ns.qn('w:val')

    # Walk pPr -> numPr, bailing out as soon as a link in the chain is missing.
    ppr = para.pPr
    if ppr is None:
        return None, None
    numpr = ppr.numPr
    if numpr is None:
        return None, None

    numId = numpr.numId
    ilvl = numpr.ilvl
    numId_val = None if numId is None else numId.get(val_name)
    ilvl_val = None if ilvl is None else ilvl.get(val_name)
    return numId_val, ilvl_val
# Driver: walk the body elements in document order and print each element's
# text, prefixed with its rendered list number when it has a numId.
numbering_part = get_numbering_part(doc)
known_formats = get_known_formats()
numbering_part_stack = {}
for i, element in enumerate(doc.element.body):
    text = element.text
    numId_val, ilvl_val = get_ppr_val(element)
    # Elements without text are skipped entirely.
    if text is None:
        continue
    print(i)
    print('-'*50)
    if numId_val is None:
        print(text)
    else:
        prefix, numbering_part_stack = apply_numbering(
            numId_val, ilvl_val,
            numbering_part, numbering_part_stack, known_formats)
        print(f'{prefix} {text}')
    print('-'*50)
    print()
0
--------------------------------------------------
Numbering
--------------------------------------------------
1
--------------------------------------------------
1. 1
--------------------------------------------------
2
--------------------------------------------------
A. 1-A
--------------------------------------------------
3
--------------------------------------------------
i. 1-A-Roman 1
--------------------------------------------------
4
--------------------------------------------------
ii. 1-A-Roman 2
--------------------------------------------------
5
--------------------------------------------------
B. 1-B
--------------------------------------------------
6
--------------------------------------------------
2. 2
--------------------------------------------------
7
--------------------------------------------------
3. 3
--------------------------------------------------
8
--------------------------------------------------
4. 4
--------------------------------------------------
9
--------------------------------------------------
Step1. step1
--------------------------------------------------
10
--------------------------------------------------
Step2. step2
--------------------------------------------------
11
--------------------------------------------------
Step3. step3
--------------------------------------------------
12
--------------------------------------------------
1 Circle 1
--------------------------------------------------
13
--------------------------------------------------
A. Circle 1-A
--------------------------------------------------
14
--------------------------------------------------
B. Circle 1-B
--------------------------------------------------
15
--------------------------------------------------
i. Circle 1-B-Roman 1
--------------------------------------------------
16
--------------------------------------------------
ii. Circle 1-B-Roman 2
--------------------------------------------------
17
--------------------------------------------------
C. Circle 1-C
--------------------------------------------------
18
--------------------------------------------------
2 Circle 2
--------------------------------------------------
19
--------------------------------------------------
3 Circle 3
--------------------------------------------------
20
--------------------------------------------------
Multi-Level
--------------------------------------------------
21
--------------------------------------------------
1. 1
--------------------------------------------------
22
--------------------------------------------------
1.1. 1-1
--------------------------------------------------
23
--------------------------------------------------
1.1.1. 1-1-1
--------------------------------------------------
24
--------------------------------------------------
1.1.2. 1-1--2
--------------------------------------------------
25
--------------------------------------------------
1.2. 1-2
--------------------------------------------------
26
--------------------------------------------------
1.2.1. 1-2-1
--------------------------------------------------
27
--------------------------------------------------
1.3. 1-3
--------------------------------------------------
28
--------------------------------------------------
2. 2
--------------------------------------------------
29
--------------------------------------------------
3. 3
--------------------------------------------------
If you have any alternative approaches or improvements (such as supporting more formats in "known_formats"),
please share them with us. Thank you.
Metadata
Metadata
Assignees
Labels
No labels