More than 5 years have passed since last update.

PythonとGoでxmlのパース速度を比較してみる

Last updated at 2017-07-05 · Posted at 2017-05-05

日々 http://manga-now.com で xml をパースしているのですが、Python の実装を Go に変えたら速くなるのか比較してみました。なおパースの速度だけを比較したいので、xml がメモリに読み込まれた状態から各要素を取得し終わるまでの時間を計測しています。

xml のダウンロード

まず Amazon Product Advertising API を使って書籍情報の xml を落としてファイルに保存しておきます。

get_books_xml.go

$ mkdir xmls
$ go run get_books_xml.go

AccessKey, SecretKey, AssociateTag を適当なものに変更して実行すると xmls ディレクトリに 145個のファイルが保存されます。1つのファイルには10冊までの情報が含まれ、合計1442冊の情報になります。

Python で実行

parse_amazon_xml.py

# -*- coding:utf-8 -*-
import time
from lxml import objectify


class ImageInfo:
	"""URL and dimensions of a single book image.

	All three attributes start out as empty strings and are expected to be
	overwritten by the parser when the image is present in the XML.
	"""

	def __init__(self):
		# url, width and height all default to the empty string.
		self.url, self.width, self.height = '', '', ''

class BookInfo:
	"""Bibliographic fields extracted for one book, plus its images.

	The text attributes default to empty strings; ``images`` is a fresh dict
	per instance, filled by the parser with one entry per image label.
	"""

	def __init__(self):
		# Text fields, all initialised to the empty string.
		for field in ('asin', 'title', 'binding', 'author',
				'publisher', 'publicationDate'):
			setattr(self, field, '')
		# Created here (not as a class attribute) so instances never share it.
		self.images = {}


def getText(dom, tag):
	"""Return the text of the child of ``dom`` named ``tag``.

	Args:
		dom: an object exposing its children as attributes (here an
			lxml.objectify element).
		tag: name of the child to read.

	Returns:
		The child's ``text`` attribute, or '' when ``dom`` has no such child.
	"""
	# Look the child up by attribute access instead of `tag in dom`:
	# on lxml elements the `in` operator tests whether an *element* is a
	# child, so a tag-name string never matches and '' was always returned.
	child = getattr(dom, tag, None)
	return child.text if child is not None else ''


def parseXmls(xmls):
	"""Parse XML documents into a flat list of BookInfo objects.

	Args:
		xmls: iterable of XML documents, each passed to
			lxml.objectify.fromstring.

	Returns:
		A list with one BookInfo per element yielded by iterating
		``dom.Items.Item``, in document order. Each ``BookInfo.images`` maps
		'SmallImage', 'MediumImage' and 'LargeImage' to an ImageInfo; an
		image that is absent from the XML keeps the ImageInfo defaults.
	"""
	# The label list never changes, so build it once instead of per item.
	imageLabels = ['SmallImage', 'MediumImage', 'LargeImage']

	bookInfos = []
	for xml in xmls:
		dom = objectify.fromstring(xml)
		for item in dom.Items.Item:
			bookInfo = BookInfo()
			bookInfo.asin = item.ASIN.text

			attr = item.ItemAttributes
			bookInfo.title = getText(attr, 'Title')
			bookInfo.binding = getText(attr, 'Binding')
			bookInfo.author = getText(attr, 'Author')
			bookInfo.publisher = getText(attr, 'Publisher')
			bookInfo.publicationDate = getText(attr, 'PublicationDate')

			for imageLabel in imageLabels:
				image = ImageInfo()
				# Fetch the child by attribute access. The previous check,
				# `imageLabel in item`, is an element-membership test on lxml
				# elements and never matches a tag-name string, so the image
				# fields were silently left at their defaults.
				node = getattr(item, imageLabel, None)
				if node is not None:
					image.url = node.URL.text
					image.width = int(node.Width.text)
					image.height = int(node.Height.text)
				bookInfo.images[imageLabel] = image

			bookInfos.append(bookInfo)

	return bookInfos


def getXmls():
	"""Read xmls/0.xml, xmls/10.xml, ..., xmls/1440.xml into memory.

	Returns:
		A list with the full contents of each file, in ascending index order.
	"""
	# File names are numbered in steps of 10, from 0 up to and including 1440.
	paths = ['xmls/{}.xml'.format(index) for index in range(0, 1440+1, 10)]

	contents = []
	for path in paths:
		with open(path, 'r') as handle:
			contents.append(handle.read())
	return contents


def main():
	"""Load the XML files, time the parsing step only, and print a summary."""
	xmls = getXmls()

	# The files are already in memory here, so only parseXmls is timed.
	started = time.time()
	bookInfos = parseXmls(xmls)
	elapsed = time.time() - started

	report = [
		('xml数: {}', len(xmls)),
		('book数: {}', len(bookInfos)),
		('parse時間: {}秒', elapsed),
	]
	for template, value in report:
		print(template.format(value))


# Run the benchmark only when executed as a script, not when imported.
if __name__ == '__main__':
	main()

$ python parse_amazon_xml.py
xml数: 145
book数: 1442
parse時間: 0.14079904556274414秒

0.140秒でした。パースには lxml モジュールを使用しています。

Go で実行

parse_amazon_xml.go

package main

import (
	"fmt"
	"github.com/PuerkitoBio/goquery"
	"io/ioutil"
	"strconv"
	"strings"
	"time"
)

// ImageInfo holds the URL and dimensions of a single book image.
//
// The field order (url, width, height) is relied on by positional composite
// literals, so it must not be changed.
type ImageInfo struct {
	url           string
	width, height int
}

// BookInfo holds the fields extracted for one book: six text fields and the
// book's images keyed by image label.
type BookInfo struct {
	asin, title, binding               string
	author, publisher, publicationDate string

	// images maps an image label such as "SmallImage" to its ImageInfo.
	images map[string]ImageInfo
}

// parseXmls parses every document in xmls with goquery and returns one
// BookInfo per Item element, in document order.
//
// A document that goquery fails to parse is skipped. Elements that are
// missing from an Item yield zero values: "" for text fields, and an
// ImageInfo with an empty url and 0 width/height for an absent image.
func parseXmls(xmls []string) []BookInfo {
	// The label list is loop-invariant, so build it once.
	imageLabels := []string{
		"SmallImage",
		"MediumImage",
		"LargeImage",
	}

	// firstText returns the text of the first descendant of sel named tag.
	// Selection.Text concatenates the text of every match, so without First
	// a repeated element (for example several Author entries) would be
	// glued into a single string.
	firstText := func(sel *goquery.Selection, tag string) string {
		return sel.Find(tag).First().Text()
	}

	bookInfos := []BookInfo{}
	for _, xml := range xmls {
		dom, err := goquery.NewDocumentFromReader(strings.NewReader(xml))
		if err != nil {
			// No usable document was produced; skip it rather than
			// dereferencing dom below.
			continue
		}
		dom.Find("Item").Each(func(_ int, item *goquery.Selection) {
			bookInfo := BookInfo{}
			bookInfo.asin = firstText(item, "ASIN")

			attributes := item.Find("ItemAttributes").First()
			if attributes.Length() > 0 {
				bookInfo.title = firstText(attributes, "Title")
				bookInfo.binding = firstText(attributes, "Binding")
				bookInfo.author = firstText(attributes, "Author")
				bookInfo.publisher = firstText(attributes, "Publisher")
				bookInfo.publicationDate = firstText(attributes, "PublicationDate")
			}

			images := make(map[string]ImageInfo, len(imageLabels))
			for _, imageLabel := range imageLabels {
				imageNode := item.Find(imageLabel).First()
				// Atoi errors are deliberately ignored: an absent image has
				// no text, and its dimensions are recorded as 0.
				// Width and Height were previously read from each other's
				// element; each is now read from its own.
				width, _ := strconv.Atoi(imageNode.Find("Width").Text())
				height, _ := strconv.Atoi(imageNode.Find("Height").Text())
				images[imageLabel] = ImageInfo{
					url:    imageNode.Find("URL").Text(),
					width:  width,
					height: height,
				}
			}
			bookInfo.images = images

			bookInfos = append(bookInfos, bookInfo)
		})
	}
	return bookInfos
}

// getXmls reads xmls/0.xml, xmls/10.xml, ..., xmls/1440.xml and returns their
// contents as strings, in ascending index order.
//
// Read errors are discarded: whatever ReadFile returned for that file is
// still appended, so the result always has one entry per index.
func getXmls() []string {
	const (
		lastIndex = 1440
		step      = 10
	)
	contents := make([]string, 0, lastIndex/step+1)
	for n := 0; n <= lastIndex; n += step {
		data, _ := ioutil.ReadFile(fmt.Sprintf("xmls/%d.xml", n))
		contents = append(contents, string(data))
	}
	return contents
}

// main loads the XML files, times the parsing step only, and prints the
// document count, the book count and the elapsed parse time.
func main() {
	docs := getXmls()

	// The files are already in memory here, so only parseXmls is timed.
	begin := time.Now()
	books := parseXmls(docs)
	elapsed := time.Since(begin)

	fmt.Printf("xml数: %d\n", len(docs))
	fmt.Printf("book数: %d\n", len(books))
	fmt.Printf("parse時間: %f秒\n", elapsed.Seconds())
}

$ go run parse_amazon_xml.go
xml数: 145
book数: 1442
parse時間: 0.180461秒

0.18秒。Python より遅いですね。パースには goquery を使っています。

Go で並列実行

シングルスレッドだと Go の方が遅いけど、Go なら並列実行が簡単に行えるのでこちらも比較してみます。実行しているCPUは2コア4スレッドです。コードの変更箇所だけ書きます。

parse_amazon_xml_th.go

// parseXmls now takes the result channel as its first argument
// and no longer has a return value.
func parseXmls(result chan []BookInfo, xmls []string) {
	...同じなので省略
	// Send the parsed books over the channel (this replaces the return statement).
	result <- bookInfos
}

// divideXmls splits xmls into num contiguous chunks that preserve the
// original order.
//
// Chunk sizes differ by at most one: the first len(xmls)%num chunks receive
// one extra element, so the remainder is spread out instead of being piled
// onto the last chunk. When num exceeds len(xmls) the trailing chunks are
// empty. A num below 1 is treated as 1 (a single chunk holding everything)
// rather than dividing by zero.
//
// The chunks are sub-slices of xmls and share its backing array.
func divideXmls(xmls []string, num int) [][]string {
	if num < 1 {
		num = 1
	}
	size, extra := len(xmls)/num, len(xmls)%num

	result := make([][]string, 0, num)
	start := 0
	for i := 0; i < num; i++ {
		end := start + size
		if i < extra {
			end++
		}
		result = append(result, xmls[start:end])
		start = end
	}
	return result
}

// main loads the XML files, parses them in four goroutines, and prints the
// document count, the book count and the elapsed parse time.
func main() {
	allXmls := getXmls()
	// Split the documents into four chunks, one per goroutine.
	chunks := divideXmls(allXmls, 4)
	begin := time.Now()

	results := make(chan []BookInfo)
	// Start one parser goroutine per chunk.
	for i := range chunks {
		go parseXmls(results, chunks[i])
	}
	// Receive exactly one result per chunk and merge them into one slice;
	// results are appended in the order they arrive on the channel.
	bookInfos := []BookInfo{}
	for range chunks {
		bookInfos = append(bookInfos, <-results...)
	}

	elapsed := time.Since(begin)
	fmt.Printf("xml数: %d\n", len(allXmls))
	fmt.Printf("book数: %d\n", len(bookInfos))
	fmt.Printf("parse時間: %f秒\n", elapsed.Seconds())
}

$ go run parse_amazon_xml_th.go
xml数: 145
book数: 1442
parse時間: 0.084918秒

0.084秒。1スレッドで実行したとき（0.180秒）と比べて約2倍の速さになりました。

まとめ

実装	速度
Python (lxml)	0.140秒
Go (goquery) 1スレッド	0.180秒
Go (goquery) 4スレッド	0.084秒

並列実行してこその Go （並列実行しないと Go のメリットはない）

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up