# -*- coding: utf-8 -*-
"""
index_static.py
- patched so that pyblosxom-cmd staticrender exports all links as .html
based on index.py from:
http://snarfed.org/space/pyblosxom+index
Ryan Barrett
This plugin displays an alphabetical index of all entries. It uses these
optional config variables from config.py, shown here with their defaults:
py['index_trigger'] = '/site-index'
py['index_num_columns'] = 2
py['index_letters_first'] = True
py['index_title'] = 'index'
py['index_use_story_template'] = True
VERSION:
0.2
TODO:
- use a template instead of hard-coded HTML
Copyright 2006 Ryan Barrett
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
"""
import math
import os.path
import time
from Pyblosxom import tools
import Pyblosxom.entries
__author__ = 'Ryan Barrett'
__version__ = '0.2'
__url__ = 'http://snarfed.org/space/pyblosxom+index'
__description__ = 'Displays an alphabetical index of all entries.'
def verify_installation(request):
return 1
def cb_filelist(args):
request = args['request']
http = request.getHttp()
data = request.getData()
config = request.getConfiguration()
trigger = config.get('index_trigger', 'site-index')
if http['PATH_INFO'] != trigger:
return
# get the entries
datadir = config['datadir']
files = tools.Walk(request, datadir)
files.sort()
# sort into sections, one for each letter. the dictionary is
# letter => (entry name, path) where path is the relative to datadir.
sections = {}
entry_extensions = data['extensions'].keys()
for file in files:
assert file.startswith(datadir)
path, ext = os.path.splitext(file[len(datadir):])
if ext[1:] in entry_extensions: # strip the leading period from ext
entry_name = os.path.basename(path)
sections.setdefault(entry_name[0].upper(), []).append((entry_name, path))
# extract the first letters. sort as usual, except that numbers and other
# non-letters go *after* letters.
def letters_before_symbols(a, b):
if a.isalpha() and not b.isalpha():
return -1
elif not a.isalpha() and b.isalpha():
return 1
else:
return cmp(a, b)
letters = sections.keys()
if config.get('index_letters_first', 1):
letters.sort(letters_before_symbols)
else:
letters.sort()
# add the header with links to each section
body = '<p class="index-header">\n'
body += ' '.join(['<a href="#%s">%s</a>' % (l, l) for l in letters])
body += '\n</p>\n\n'
# add the sections themselves, with one link per entry, in a table. the
# number of columns is taken from the index_num_columns config variable.
# entries are ordered down each column, in order.
num_cols = config.get('index_num_columns', 2)
for l in letters:
body += '<h3><a name="%s">%s</a></h3>\n' % (l, l)
body += '<table>\n'
entries = sections[l]
entries.sort()
num_rows = int(math.ceil(float(len(entries)) / num_cols))
for row in range(0, num_rows):
# alternate the tags' class between index-row-stripe-0 and
# index-row-stripe-1, so you can use CSS to alternate their color for
# readability, if you want.
body += '<tr class="index-row-stripe-%d">\n' % (row % 2)
for col in range(0, num_cols):
entry_index = col * num_rows + row
if entry_index < len(entries):
entry_name, path = entries[entry_index]
else:
entry_name = path = ''
body += '<td><a href="%s.html">%s</a></td>\n' % (path[1:], entry_name)
body += '</tr>\n'
body += '</table>\n\n'
data = {'title': config.get('index_title', 'index')}
# use the epoch for mtime. otherwise, pyblosxom uses the current time, which
# makes other plugins (like weblogsping) think this is a new entry.
epoch = time.localtime(0)
fe = Pyblosxom.entries.base.generate_entry(request, data, body, epoch)
return [fe]
def cb_story(args):
request = args['request']
http = request.getHttp()
config = request.getConfiguration()
trigger = config.get('index_trigger', 'site-index')
if (http['PATH_INFO'] == trigger and
not config.get('index_use_story_template', 1)):
title = config.get('index_title', 'index')
args['template'] = '<h1>%s</h1>\n\n$body' % title
return args
# -*- coding: utf-8 -*-
from Pyblosxom import tools
import os, os.path, posix, re, stat, time
FILETIME = re.compile('^([0-9]{4})-([0-1][0-9])-([0-3][0-9])-([0-2][0-9])-([0-5][0-9]) +(.*)$')
all_timestamps = {}
extensions = []
timestamps_to_save = {}
#mode: python; indent-tabs-mode: t, tab-width: 4
"""
This allows the user to create a file "timestamps" in their datadir
that overrides the timestamp of any given blog entry. Each line
in this file should be of the form "YYYY-MM-DD-hh-mm file-name".
Then, for any entry that has such a line, the system will use that
timestamp instead of the file's actual modification time.
Note: the filename is relative to your datadir.
Example line for the file /var/data-dir/school/abc.txt,
where the datadir is "/var/data-dir/" and the date is Aug 9, 2004:
2004-08-09-00-00 school/abc.txt
"""
__author__ = 'Nathan Kent Bullock, Ryan Barrett'
__homepage__ = 'http://snarfed.org/space/hardcodedates'
__email__ = 'nathan_kent_bullock -at- yahoo.ca, hardcodedates -at- ryanb.org'
__version__ = '1.4'
def init(request):
if all_timestamps:
return # already initialized
datadir = request.getConfiguration()['datadir']
timestamp_file = os.path.join(datadir, 'timestamps')
if os.path.isfile(timestamp_file):
f = file(timestamp_file)
for line in f.readlines():
m = FILETIME.search(line.strip())
if m:
year = int(m.group(1))
mo = int(m.group(2))
day = int(m.group(3))
hr = int(m.group(4))
minute = int(m.group(5))
mtime = time.mktime((year,mo,day,hr,minute,0,0,0,-1))
filename = os.path.join(datadir, m.group(6))
all_timestamps[filename] = mtime
f.close()
extensions.extend(request.getData()['extensions'].keys())
extensions.append(request.getConfiguration().get('comment_ext', 'cmt'))
def cb_filestat(args):
request = args['request']
init(request)
filename = args['filename']
extension = os.path.splitext(filename)[1][1:]
datadir = request.getConfiguration()['datadir']
if all_timestamps.has_key(filename):
# we know this file's timestamp
mtime = args['mtime']
assert isinstance(mtime, (tuple, posix.stat_result))
args['mtime'] = (mtime[0:stat.ST_MTIME] + (all_timestamps[filename],) +
mtime[stat.ST_MTIME + 1:])
elif extension in extensions and filename.startswith(datadir):
# we don't know it, but we should. ask the os for it, and remember it.
args['mtime'] = os.stat(filename)
all_timestamps[filename] = args['mtime'][stat.ST_MTIME]
timestamps_to_save[filename] = args['mtime'][stat.ST_MTIME]
return args
def cb_end(args):
if timestamps_to_save:
datadir = args['request'].getConfiguration()['datadir']
datadir = os.path.normpath(datadir)
tsfile = file(os.path.join(datadir, 'timestamps'), 'a')
for filename, mtime in timestamps_to_save.items():
time_str = time.strftime('%Y-%m-%d-%H-%M', time.localtime(mtime))
# strip the datadir prefix and directory separator slash
filename = filename[len(datadir) + 1:]
tsfile.write('%s %s\n' % (time_str, filename))
tools.getLogger().info('Saved mtime %s for %s' % (time_str, filename))
tsfile.close()
timestamps_to_save.clear()
# -*- coding: utf-8 -*-
"""
category_static.py
- patched so that pyblosxom-cmd staticrender exports all links as .html
based on index.py from:
http://snarfed.org/space/pyblosxom+index
Ryan Barrett
- Zoom.Quiet 100401
This plugin displays an alphabetical index of all entries. It uses these
optional config variables from config.py, shown here with their defaults:
py['cindex_trigger'] = '/site-index'
py['cindex_num_columns'] = 2
py['cindex_letters_first'] = True
py['cindex_title'] = 'index'
py['cindex_use_story_template'] = True
VERSION:
0.2
TODO:
- use a template instead of hard-coded HTML
Copyright 2006 Ryan Barrett
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
"""
import math
import os.path
import time
from operator import itemgetter
from Pyblosxom import tools
import Pyblosxom.entries
#import re
__author__ = 'Zoom.Quiet '
__version__ = '11.09.7'
__url__ = "http://blog.zoomquiet.org/pyblosxom/techic/PyBlosxom/plugins/category_static.html"
__description__ = 'Displays a category index as a tree of all entries, on one page.'
DEFAULT_ROOT = []
def verify_installation(request):
return 1
def cb_filelist(args):
request = args['request']
http = request.getHttp()
data = request.getData()
config = request.getConfiguration()
_baseurl = config.get("base_url", "")
trigger = config.get('cindex_trigger', 'site-index')
if http['PATH_INFO'] != trigger:
return
# get the entries
datadir = config['datadir']
files = tools.Walk(request, datadir)
files.sort()
body = ''
#print files
# sort into sections, one for each letter. the dictionary is
# letter => (entry name, path) where path is the relative to datadir.
#sections = {}
# entrise is a list of tuples:
# (entry name, path, full filename)
entrise = []
entry_extensions = data['extensions'].keys()
for file in files:
#objEntry = entries.fileentry.FileEntry(request, file,datadir)
#print objEntry.keys()
assert file.startswith(datadir)
path, ext = os.path.splitext(file[len(datadir):])
if ext[1:] in entry_extensions: # strip the leading period from ext
entry_name = os.path.basename(path)
#sections.setdefault(entry_name[0].upper(), []).append((entry_name, path))
entrise.append((entry_name,path,file))
#print entrise
#sortPaths = sorted(entrise.iteritems(), key=itemgetter(1), reverse=True)
#print sortPaths
#print _baseurl
etree = {}
"""{
"pathID":[(path.split()),"title",...]
,
}
"""
for entry in entrise:
e = Pyblosxom.entries.fileentry.FileEntry(request, entry[2], entry[1])
deeps = entry[1].split("/")[:-1]
pathID = "".join(deeps)
if pathID in etree:
etree[pathID].append((e['title'],entry[1]))
else:
etree[pathID]= [tuple(deeps),(e['title'],entry[1])]
#print etree.keys()
root_path_list = config.get("category_root_list", DEFAULT_ROOT)
root_entry_list = []
for opath in root_path_list:
#print opath
crtRoot = []
for k in etree.keys():
if opath in k:
crtRoot.append(k)
crtRoot.sort()
root_entry_list.append((opath,crtRoot))
'''root_entry_list as::
[('Zen', ['ZenChinese', 'ZenGoogle', 'Zenpythonic']), ('oss', ['oss', 'ossFreeBSD', 'ossMozillaFireFox', 'ossUbuntu']), ('opening', []), ('mind', ['mind']), ('Quiet', ['Quietliving', 'Quietnomeans']), ('utility', ['utilitySubversion', 'utilitySubversionhooks', 'utilitypy4strStructuredText', 'utilitypy4webDjango', 'utilitypy4webMoinMoin', 'utilitypy4webQuixote', 'utilitypy4zh', 'utilityzqlib']), ('internet', ['internet', 'internetFolksonomy']), ('easy', ['easymovie', 'easymusic']), ('techic', ['techic', 'techicEmacs', 'techicPyBlosxom', 'techicPyBlosxomblosxom', 'techicPyBlosxomplugins'])]
'''
body += ' /'
for e in etree[''][1:]:
#print "etree[''] include::",e
body += ' %s<a href="%s%s.html">%s</a><br/>\n'%(
"...."
,_baseurl
,e[1]
,e[0]
)
#print root_entry_list
for k in root_entry_list:
#['techic', 'techicEmacs', 'techicPyBlosxom', 'techicPyBlosxomblosxom', 'techicPyBlosxomplugins']
body += ' %s/'%k[0]
cpath = ""
for p in k[1]:
#print etree[p]
#[('', 'Zen', 'Chinese'), '9.18', 'CC Salon BJ', '\xe2\x80\x9c\xe5\x9b\xbd\xe9\x99\x85\xe8\x87\xaa\xe7\x94\xb1\xe8\xbd\xaf\xe4\xbb\xb6\xe6\x97\xa5\xe2\x80\x9d\xe4\xb9\x8b\xe5\xa4\xb4\xe8\x84\x91\xe9\xa3\x8e\xe6\x9a\xb4', '\xe8\xa1\xa8\xe5\xbd\xa2\xe7\xa0\x81\xe7\x9a\x84\xe6\xb6\x88\xe4\xba\xa1\xe8\x83\x8c\xe6\x99\xaf\xef\xbc\x81']
epath = "/".join(etree[p][0][2:])
if k[0] != "".join(etree[p][0]):
if cpath != epath:
cpath = epath
ldeep = len(etree[p][0][1:])
if 3 > ldeep:
body += ' %s/'%"/".join(etree[p][0][2:])
else:
body += ' %s/'%"/".join(etree[p][0][3:])
for e in etree[p][1:]:
body += ' <span title="%s">%s</span><a href="%s%s.html">%s</a><br/>\n'%(
"/".join(etree[p][0])
,"..."*len(etree[p][0])
,_baseurl
,e[1]
,e[0]
)
'''
[('', 'easy', 'movie'), '\xe4\xb8\x96\xe9\x97\xb4\xe5\xae\x89\xe5\xbe\x97\xe5\x8f\x8c\xe5\x85\xa8\xe6\xb3\x95,\xe4\xb8\x8d\xe8\xb4\x9f\xe5\xa6\x82\xe6\x9d\xa5\xe4\xb8\x8d\xe8\xb4\x9f\xe5\x8d\xbf!']
[('', 'easy', 'music'), 'ZARD\xe6\xb6\x88\xe9\x80\x9d\xe4\xba\x86']
for entry in entrise:
e = Pyblosxom.entries.fileentry.FileEntry(request, entry[2], entry[1])
#print e['title']
#print entry[1].split("/")[:-1]
body += ' %s%s%s \n'%(
"...."*len(entry[1].split("/"))
,_baseurl
,entry[1]
,e['title'] #entry[0]
,entry[1]
)
'''
#print body
body +=" "
data = {'title': config.get('cindex_title', 'index')}
# use the epoch for mtime. otherwise, pyblosxom uses the current time, which
# makes other plugins (like weblogsping) think this is a new entry.
epoch = time.localtime(0)
fe = Pyblosxom.entries.base.generate_entry(request, data, body, epoch)
return [fe]
def cb_story(args):
request = args['request']
http = request.getHttp()
config = request.getConfiguration()
trigger = config.get('cindex_trigger', 'site-index')
if (http['PATH_INFO'] == trigger and
not config.get('cindex_use_story_template', 1)):
title = config.get('cindex_title', 'index')
args['template'] = '<h1>%s</h1>\n\n$body' % title
return args
What was the problem?
- In a high-load service environment, the system's responses needed to be sped up
- For every data query, the existing system tries three data sources in turn:
  - memcache
  - redis
  - MySQL
- The target was to speed queries up by at least 300%
- Runtime environment:
  Client machine:
  - my laptop, an HP 520
  - dual-core CPU, 2 GB RAM
  - Ubuntu 9.10
  DB host:
  - CentOS 5.0
  - single-core CPU, 4 GB RAM
At first glance this looked simple: just put everything in memory.
- A quick count showed fewer than 20 million records are needed for the queries
- As a flat file that is no more than about 1 GB
- Loaded into memory as a Python dict it would at most double in size, which is entirely acceptable
- And the speed?!
What I did not expect: the way to make it faster was simply to keep writing the code shorter and shorter!
Don't know what IOP is?
- The most direct implementation:
  - read from redis
  - build a dict object
  - pickle.dump it to a serialized file
  - pickle.load it back into a dict object
- The code:
#!/usr/bin/python
# -*- coding: utf-8 -*-
import struct,sys,time
import cPickle as pickle
import redis
REVERSION = "r2d.py v10.5.7"
def _push2dict(dictall, key, smembers):
    # pack the numeric part of the redis key into a 4-byte string to keep the dict small
    dictall[struct.pack('I', int(key[1:]))] = [s.split("|") for s in smembers]
    return dictall

if __name__ == '__main__':   # this way the module can be imported without running
    if 3 != len(sys.argv):
        print """ %s usage::
    $ python r2d.py redisIP limitnumber [like 10000]
""" % REVERSION
    else:
        hostIP = sys.argv[1]
        limitn = sys.argv[2]
        rb = redis.Redis(host=hostIP, port=6379, db=9)
        rbkeys = rb.keys().split()   # old redis-py returns all keys as one space-separated string
        loop = int(limitn)
        s4dict = {}
        for k in rbkeys:
            if 0 == loop:
                break
            else:
                loop -= 1
                _push2dict(s4dict, k, rb.smembers(k))
        pickle.dump(s4dict, open('r4d.dump', 'wb'))
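For reference, a minimal sketch of the load step named in the list above; it simply reads back the r4d.dump file written by the last line:

import cPickle as pickle
s4dict = pickle.load(open('r4d.dump', 'rb'))   # rebuild the whole dict in one call
print "%d keys loaded" % len(s4dict)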
- The dump script above is simple enough: a single function, 20 lines
- Speed test::
  - 100 K key/value pairs: export > 170 s, import > 4 s
  - 1 M pairs: export > 500 s, import > 25 s
  - 10 M pairs: export failed! Memory ran out before it finished!
That was completely unacceptable...
- Acceleration attempts::
  - Following IOP, avoid functions where possible: deleting that one-line function and inlining its body back into the loop immediately bought a few seconds
  - cPickle was already in use, so there was nothing left to gain from the module itself
  - Hmm, so can we drop cPickle altogether?
# use str() to write the dict object into a .py file as plain text
vdf = open("r2d.define.py", "w")
vdf.write("s4dict=")
vdf.write(str(s4dict))
vdf.close()
# to use it later, just import the file
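A minimal sketch of that import step; it assumes the dump is written under an importable name such as r2d_define.py, since a module name containing a dot cannot be imported directly:

import time
t0 = time.time()
from r2d_define import s4dict   # hypothetical module name for the str()-dumped dict
print "loaded %d keys in %.1f s" % (len(s4dict), time.time() - t0)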
- Test after the change::
  - 100 K pairs: export > 160 s, import > 3 s
  - 1 M pairs: export > 400 s, import > 19 s
  - 10 M pairs: export still a disaster
- The speedup was not significant:
  - Watching more closely, memory usage exploded during export:
  - at the 1 M level it ate about 1.6 GB of memory
  - redis itself is memory-hungry too; at the 10 M level it also takes gigabytes (depending, of course, on the number and content of the entries)
  - For a server this approach is unacceptable
- Improve the intermediate data format:
  - read from redis
  - produce an intermediate log file
  - import that into the dict object
Because of these realities:
- Whether the dict is dumped with pickle or as a .py file, the whole object has to be built in memory first
- As the dict grows, that construction step inevitably takes longer
A tip from Shen Youxia revealed that a dict object can in fact be written out linearly:
- Say the dict has the structure:
  {key: [(v1, v2, v3), ...],
   ...
  }
- Then it can pass through an intermediate data text with one record per line:
  ('key', [(v1, 'v2', v3)])
  ...
- and be loaded back linearly, e.g.:
for l in open("r2d.define.py.log","r").readlines():
dd = eval(l)
if dd[0] in s4d:
s4d[dd[0]].append((dd[1],dd[2],dd[3]))
else:
s4d[dd[0]]=[(dd[1],dd[2],dd[3])]
A quick revision of the corresponding working code:
#!/usr/bin/python
# -*- coding: utf-8 -*-
import struct,sys,time
import redis
REVERSION = "r2d.py v10.5.8"
if __name__ == '__main__':   # this way the module can be imported without running
    if 3 != len(sys.argv):
        print """ %s usage::
    $ python r2d.py redisIP limitnumber [like 10000] > mid-data.log
""" % REVERSION
    else:
        hostIP = sys.argv[1]
        limitn = sys.argv[2]
        rb = redis.Redis(host=hostIP, port=6379, db=9)
        rbkeys = rb.keys().split()
        loop = int(limitn)
        for k in rbkeys:
            if 0 == loop:
                break
            else:
                loop -= 1
                dictkey = struct.pack('I', int(k[1:]))
                sli = []
                for s in rb.smembers(k):
                    rli = s.split("|")[:3]
                    if 3 == len(rli):
                        rli[0] = int(rli[0])
                        rli[1] = struct.pack('I', int(rli[1]))
                        rli[2] = int(rli[2])
                        sli.append(tuple(rli))
                print `dictkey, sli`   # one record per line, to stdout (redirect to mid-data.log)
# note: `obj` is the same as repr(obj)
The code is still simple: no functions, 30 lines
- Speed test::
  - 100 K pairs: export > 150 s, import > 10 s
  - 1 M pairs: export > 500 s, import > 20 s
  - 10 M pairs: export > 1300 s, import > 90 s
- The speed does not look noticeably better
- But!
  - the 10-million-pair data set can now finish on a low-spec machine!
  - memory usage stays flat at about 90 MB and no longer grows with the dict!
OK, so this was at least usable...
- Further improvement:
  - drop redis and read straight from MySQL
  - produce the intermediate log
  - import that into the dict file
Even though redis bills itself as the fastest K/V store, it was clearly what dragged the whole service's response time down...
- Why?
  - A server program and a local program face different environments
  - A high-load server program and a low-load one are different again
  - Roughly speaking:
    - small service ~= concurrency below about 60 per second
    - medium service ~= concurrency below about 600 per second
    - large service ~= concurrency above about 1000 per second
  - and the dominant constraint is completely different in each case:
    - small service -> language execution speed
    - medium service -> framework overhead
    - large service -> I/O response time
  - So for a web-facing query service, both Redis and MySQL are inter-process communication as far as the business system is concerned!
  - Every cross-process call means at least four I/O operations!
  - Which also means the current Redis -> log -> in-memory dict pipeline still contains a MySQL -> Redis inter-process step
  - Besides:
    - combing through the Redis documentation turned up nothing like iterkeys()!
    - so no matter how many Redis pairs you want to convert, you first have to pull every key out with keys() and then walk through them one by one
    - no wonder the script ran more than 5x slower, and used several times more memory, against the full Redis store (tens of millions of pairs) than against the small test store (200 K pairs)!
So: read directly from the relevant MySQL table,
with the help of a SQL template:
-- _tpl/all_black.tpl
SELECT v1,v2,v3,v4,id FROM t_black LIMIT %(limitMAX)s;
Core code:
#!/usr/bin/python
# -*- coding: utf-8 -*-
REVERSION = "m2d.py v10.5.9"
import struct,sys,time
if __name__ == '__main__':   # this way the module can be imported without running
    """usage:
    $ python m2d.py limit [such as 100] |\
      mysql -h xx.xx.xx.xx -u User -D --password=*** |\
      python m2d.py > m2d.log
    """
    if 1 < len(sys.argv):
        # first pass: emit the SELECT statement(s) for mysql to run
        limit = sys.argv[1]
        limitMAX = int(limit)
        print >> sys.stderr, REVERSION
        print open("_tpl/all_black.tpl", "r").read() % locals()
    else:
        # second pass: parse mysql's tab-separated output from stdin
        s4dict = {}
        virusname = {}
        totalN = 0
        for l in sys.stdin.readlines():
            if "id" in l:
                pass                      # skip the column-header line
            else:
                totalN += 1
                lkv = l.split()
                dictkey = struct.pack('I', int(lkv[0]))
                lkv[1] = int(lkv[1])
                lkv[2] = struct.pack('I', int(lkv[2]))
                lkv[3] = int(lkv[3])
                print `dictkey, tuple(lkv[1:])`   # one record per line, to stdout (m2d.log)
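A minimal sketch of loading m2d.log back into the dict; it assumes the repr-per-line records printed above, i.e. (key, (v1, v2, v3)) tuples, and appends when a key repeats:

s4d = {}
for line in open("m2d.log"):
    key, vals = eval(line)               # each line is a (key, (v1, v2, v3)) tuple
    s4d.setdefault(key, []).append(vals)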
- Speed test::
  - 1 K pairs: export < 5 s, import < 1 s
  - 100 K pairs: export > 300 s, import > 10 s
  - 1 M pairs: total fail
  - 10 M pairs: total fail
- Why does the speed depend on how many rows are requested? And why does MySQL simply refuse to serve once the query gets big?
Hmm, FT! Of course: MySQL limits how long a request may take, so huge single transfers are bound to fail.
Use LIMIT to slice the query!
- Rework the SQL-generation part of m2d.py v10.5.9 above:
step = 5000.0
limitMAX = int(limit)
sqltpl = open("_tpl/all_black.tpl", "r").read()
if 1 >= limitMAX / step:
    print sqltpl % locals()
else:
    for i in range(int(limitMAX / step)):
        limitMAX = "%d,%d" % (step * i, int(step))
        print sqltpl % locals()      # emits e.g. LIMIT 0,5000; LIMIT 5000,5000; ...
- With a slice size of 5000 this generates constraints like LIMIT 15000,5000
- Speed test::
  - 1 M pairs: export > 290 s, import > 190 s
  - 10 M pairs: still a fail
FT! What is going on? The speed is unexpectedly bad!
- After calming down, another round of improvements:
  - read from MySQL, slicing the query by id range
  - produce the intermediate log, but redesign its structure
  - import that into the dict file
- Why is MySQL so slow?!
  - Well, after years away from databases, this was a fresh reminder of why I keep my distance
  - MySQL is simply not a tens-of-millions-of-rows product out of the box, even though there are plenty of tricks for coping
  - None of the tricks above helped, though; a colleague pointed out the answer: just constrain by id range!
The SQL template, adjusted to match:
-- _tpl/all_black.tpl
SELECT v1,v2,v3,v4,id FROM t_black WHERE id>%(LIMbwID)s AND id<=%(MAXbwID)s;
The SQL-generation code:
step = 3000.0
offset = int(step)
# <<gen_sql_with_max>> with Leo you can turn piles of code into named child nodes like this
# (stdin is fed by something like: SELECT max(id), min(id) FROM t_black; -- assumed here)
for l in sys.stdin.readlines():
    if "max(id)" not in l:               # skip the column-header line
        amount = l.split()
        MAXbwID = int(amount[0])
        MINbwID = int(amount[1])
        MAXbwA = MAXbwID - MINbwID
        MAXbMAX = MAXbwID
        lastID = 0
        sqltpl = open("_tpl/all_black.tpl", "r").read()
        for i in range(MAXbwA / offset):
            MAXbwID, LIMbwID = (MAXbMAX - offset * i, MAXbMAX - offset * (i + 1))
            print sqltpl % locals()
            lastID = LIMbwID
        MAXbwID, LIMbwID = lastID, MINbwID   # also fetch the remaining tail of ids below the last slice
        print sqltpl % locals()
- Speed test::
  - 1 M pairs: export > 280 s, import > 120 s
  - 10 M pairs: export > 1300 s, import > 780 s
Going further, it turned out the business logic could collapse its two-level, two-query lookup into a single query!
- Simply change the output dict's structure to:
  {key: v,
   ...
  }
- Combining the original two values with the original key forms a new, globally unique key
- i.e. one dict entry now corresponds exactly to one MySQL row (see the sketch after the timings below)
- Speed retest::
  - 1 M pairs: export > 250 s, import > 100 s
  - 10 M pairs: export > 1100 s, import > 480 s
- Wow! And once loaded, the dict object also takes about 30% less memory!
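A minimal sketch of that flattening step, assuming the two-level key -> [(v1, v2, v3), ...] dict described earlier; exactly which two values are folded into the key is illustrative:

flat = {}
for dictkey, rows in s4dict.items():
    for v1, v2, v3 in rows:
        flat[(dictkey, v1, v2)] = v3   # one entry per MySQL row, one lookup per query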
- To sum up:
  - The main IOP tricks applied here:
    - use as few functions as possible
    - use as few modules as possible
    - chain the steps together through the OS's standard I/O wherever you can
  - DOP ~ Data Oriented Programming
    - in the end, every program solves its problem by manipulating data
    - on the server side, under heavy concurrent load, saving I/O is the most effective optimisation there is
    - the code that squeezes down the volume of data going in and out is naturally the best code!
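As a small illustration of squeezing the data itself: the scripts above pack every numeric id with struct.pack('I', ...), so a key costs a fixed 4 bytes instead of its decimal text:

import struct
key = struct.pack('I', 12345678)      # 4-byte binary string
print len(key), len(str(12345678))    # 4 vs 8 bytes, before any dict overhead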
Hmm?! So how come, by the end of the story, nothing here seems to have sped up 100x?
- Right, right: what was shared above is only how the scripts that turn the data into an in-memory dict were accelerated
- What ultimately needed accelerating was the whole query service, remember?
- Since that touches the company's core service, the code cannot be shown here
- The reality, though, is this:
  - using nothing more than the basic IOP tricks mentioned above
  - the handling time per query dropped from 0.* seconds to 0.00* seconds, at least 100x faster
  - and the corresponding code shrank from a few thousand lines to a few dozen
So, roughly speaking, you can read it this way:
- every time you halve the size of the code
- you can hope for a 10x gain in running speed
Don't believe it? Give it a try...
- t2t rendered:: 2010-10-09 02:21:36
- powered by:: txt2tags