在學習數據挖掘的時候小編發現有些數據集是使用xml文件來進行數據管理的,而要對這樣的數據集進行處理需要使用到python的XML支持,接下來的這篇文章我們就介紹一下這三個處理XML需要的知識點:python統計xml標簽數量;python修改xml標簽內容;python獲取xml數據。
1. 根據xml文件統計目標種類以及數量
# -*- coding:utf-8 -*-
#根據xml文件統計目標種類以及數量
import os
import xml.etree.ElementTree as ET
import numpy as np
# threshold must be a finite number: NumPy raises ValueError when it is NaN.
# 10000000 matches the value used by the second script in this file.
np.set_printoptions(suppress=True, threshold=10000000)
import matplotlib
from PIL import Image
def parse_obj(xml_path, filename):
    """Collect the class name of every ``<object>`` element in one annotation file.

    Args:
        xml_path: Directory that holds the annotation file. A trailing path
            separator is optional.
        filename: File name of the annotation, including its extension.

    Returns:
        A list with one ``{'name': <text of the <name> child>}`` dict per
        ``<object>`` element found directly under the document root, in
        document order.

    Raises:
        AttributeError: If an ``<object>`` element has no ``<name>`` child.
    """
    # os.path.join rather than '+', so a directory given without a trailing
    # separator still resolves to the right file.
    tree = ET.parse(os.path.join(xml_path, filename))
    return [{'name': obj.find('name').text} for obj in tree.findall('object')]
def read_image(image_path, filename):
    """Return the pixel dimensions of an image file.

    Args:
        image_path: Directory that holds the image. A trailing path separator
            is optional.
        filename: File name of the image, including its extension.

    Returns:
        ``[W, H, area]`` where ``W`` and ``H`` are the two entries of the
        image's ``size`` and ``area`` is their product.
    """
    # The context manager closes the file handle opened by Image.open;
    # the original left it open for every image processed.
    with Image.open(os.path.join(image_path, filename)) as im:
        width, height = im.size
    return [width, height, width * height]
if __name__ == '__main__':
    # Script entry point: count how many annotated objects of each class
    # appear across all annotation files in xml_path, then print the totals.
    xml_path = '/home/dlut/網(wǎng)絡(luò)/make_database/數(shù)據(jù)集——合集/VOCdevkit/VOC2018/Annotations/'

    # Only *.xml entries are parsed; any other file in the directory used to
    # be passed to the XML parser under a wrong name and crash the run.
    xml_files = [entry for entry in os.listdir(xml_path) if entry.endswith('.xml')]

    # Parse everything first so a malformed file aborts before any output.
    recs = {xml_file: parse_obj(xml_path, xml_file) for xml_file in xml_files}

    # Dicts keep insertion order, so classes are reported in first-seen order.
    num_objs = {}
    for xml_file in xml_files:
        for record in recs[xml_file]:
            label = record['name']
            num_objs[label] = num_objs.get(label, 0) + 1

    for label, count in num_objs.items():
        print('{}:{}個(gè)'.format(label, count))
    print('信息統(tǒng)計(jì)算完畢。')
2.根據xml文件統計目標的平均長度、寬度、面積以及每一個目標在原圖中的占比
# -*- coding:utf-8 -*-
#統計
# 計算每一個目標在原圖中的占比
# 計算目標的平均長度、
# 計算平均寬度,
# 計算平均面積、
# 計算目標平均占比
import os
import xml.etree.ElementTree as ET
import numpy as np
# Earlier form, kept for reference; it passed threshold=np.nan instead of a finite value.
#np.set_printoptions(suppress=True, threshold=np.nan)
# Configure NumPy printing: suppress=True and a summarization threshold of 10,000,000.
np.set_printoptions(suppress=True, threshold=10000000) #10,000,000
import matplotlib
from PIL import Image
def parse_obj(xml_path, filename):
    """Read the class name and bounding box of every ``<object>`` in one annotation file.

    Args:
        xml_path: Directory that holds the annotation file. A trailing path
            separator is optional.
        filename: File name of the annotation, including its extension.

    Returns:
        A list with one dict per ``<object>`` element found directly under
        the document root. Each dict has ``'name'`` (text of the ``<name>``
        child) and ``'bbox'`` (``[xmin, ymin, xmax, ymax]`` as ints, read
        from the ``<bndbox>`` child).

    Raises:
        AttributeError: If a required child element is missing.
        ValueError: If a coordinate's text is not an integer literal.
    """
    # os.path.join rather than '+', so a directory given without a trailing
    # separator still resolves to the right file.
    tree = ET.parse(os.path.join(xml_path, filename))
    objects = []
    for obj in tree.findall('object'):
        bbox = obj.find('bndbox')
        objects.append({
            'name': obj.find('name').text,
            'bbox': [int(bbox.find(tag).text)
                     for tag in ('xmin', 'ymin', 'xmax', 'ymax')],
        })
    return objects
def read_image(image_path, filename):
    """Return the pixel dimensions of an image file.

    Args:
        image_path: Directory that holds the image. A trailing path separator
            is optional.
        filename: File name of the image, including its extension.

    Returns:
        ``[W, H, area]`` where ``W`` and ``H`` are the two entries of the
        image's ``size`` and ``area`` is their product.
    """
    # The context manager closes the file handle opened by Image.open;
    # the original left it open for every image processed.
    with Image.open(os.path.join(image_path, filename)) as im:
        width, height = im.size
    return [width, height, width * height]
if __name__ == '__main__':
    # Script entry point: for every annotated class, report the average box
    # width, height and area, and the average ratio of each of those to the
    # width, height and area of the image the box belongs to.
    image_path = '/home/dlut/網(wǎng)絡(luò)/make_database/數(shù)據(jù)集——合集/VOCdevkit/VOC2018/JPEGImages/'
    xml_path = '/home/dlut/網(wǎng)絡(luò)/make_database/數(shù)據(jù)集——合集/VOCdevkit/VOC2018/Annotations/'

    # Only *.xml entries are used; any other file in the directory used to be
    # passed to the XML parser under a wrong name and crash the run.
    filenames = [os.path.splitext(entry)[0]
                 for entry in os.listdir(xml_path)
                 if entry.endswith('.xml')]
    print(filenames)

    recs = {}
    ims_info = {}
    for name in filenames:
        print('正在處理 {}.xml '.format(name))
        recs[name] = parse_obj(xml_path, name + '.xml')
        print('正在處理 {}.jpg '.format(name))
        ims_info[name] = read_image(image_path, name + '.jpg')
    print('所有信息收集完畢。')
    print('正在處理信息......')

    # class name -> list of [w, h, area, w_rate, h_rate, area_rate] rows.
    # Dicts keep insertion order, so classes are reported in first-seen order.
    obs_shape = {}
    for name in filenames:
        im_w, im_h, im_area = ims_info[name]
        for record in recs[name]:
            xmin, ymin, xmax, ymax = record['bbox']
            ob_w = xmax - xmin
            ob_h = ymax - ymin
            ob_area = ob_w * ob_h
            obs_shape.setdefault(record['name'], []).append([
                ob_w,
                ob_h,
                ob_area,
                ob_w / im_w,
                ob_h / im_h,
                ob_area / im_area,
            ])

    # Column-wise average per class.
    obj_avg = {}
    for name, rows in obs_shape.items():
        obj_avg[name] = np.array(rows).sum(axis=0) / len(rows)
        # The '\n' escapes below had been turned into raw line breaks inside
        # the string literals, which made the script a SyntaxError.
        print('{}的情況如下:*******\n'.format(name))
        print(' 目標(biāo)平均W={}'.format(obj_avg[name][0]))
        print(' 目標(biāo)平均H={}'.format(obj_avg[name][1]))
        print(' 目標(biāo)平均area={}'.format(obj_avg[name][2]))
        print(' 目標(biāo)平均與原圖的W比例={}'.format(obj_avg[name][3]))
        print(' 目標(biāo)平均與原圖的H比例={}'.format(obj_avg[name][4]))
        print(' 目標(biāo)平均原圖面積占比={}\n'.format(obj_avg[name][5]))
    print('信息統(tǒng)計(jì)計(jì)算完畢。')
3.修改xml文件中某個目標的名字為另一個名字
#修改xml文件中的目標的名字,
import os, sys
import glob
from xml.etree import ElementTree as ET
# 批量讀取Annotations下的xml文件
# per=ET.parse(r'C:Users
ockhuangDesktopAnnotations