waf

FORK: waf with some random patches
git clone https://git.neptards.moe/neptards/waf.git

fast_partial.py (15511B)


#! /usr/bin/env python
# encoding: utf-8
# Thomas Nagy, 2017-2018 (ita)

"""
A system for fast partial rebuilds

Creating a large number of task objects up front can take some time.
By making a few assumptions, it is possible to avoid creating
task objects for targets that are already up-to-date.

On a silly benchmark the gain observed for 1M tasks can be 5m->10s
for a single file change.

Usage::

	def options(opt):
		opt.load('fast_partial')

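A slightly fuller wscript sketch (illustrative only: the compiler tool and
the target names are arbitrary, any project satisfying the assumptions
below will do)::

	def options(opt):
		opt.load('compiler_cxx')
		opt.load('fast_partial')

	def configure(conf):
		conf.load('compiler_cxx')

	def build(bld):
		bld.stlib(source='a.cpp', target='mylib')
		bld.program(source='main.cpp', target='app', use='mylib')
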
Assumptions:
* Start with a clean build (run "waf distclean" after enabling)
* Mostly for C/C++/Fortran targets with link tasks (object-only targets are not handled);
  try it in the folder generated by utils/genbench.py
* For full project builds: no --targets and no pruning from subfolders
* The installation phase is ignored
* `use=` dependencies are specified up front even across build groups
* Task generator source files are not obtained from globs

Implementation details:
* The first layer obtains file timestamps to recalculate file hashes only
  when necessary (similar to md5_tstamp); the timestamps are then stored
  in a dedicated pickle file
* A second layer associates each task generator with a set of files to help
  detect changes. Task generators then create their tasks only when
  the related files have been modified. A specific db file is created
  to store such data (5m -> 1m10)
* A third layer binds build context proxies onto task generators, replacing
  the default context. While loading data for the full build uses more memory
  (4GB -> 9GB), partial builds are then much faster (1m10 -> 13s)
* A fourth layer enables a 2-level cache on file signatures to
  reduce the size of the main pickle file (13s -> 10s)
     42 """
     43 
     44 import os
     45 from waflib import Build, Context, Errors, Logs, Task, TaskGen, Utils
     46 from waflib.TaskGen import feature, after_method, taskgen_method
     47 import waflib.Node
     48 
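# Task generator staleness states:
# DONE   - no related file changed; the generator needs no rebuild
# DIRTY  - a related file changed; tasks must be created and run (the default)
# NEEDED - otherwise up to date, but posted because a stale generator
#          depends on it through `use`; its object tasks are then skipped
#          (see create_compiled_task and apply_link_after below)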
DONE = 0
DIRTY = 1
NEEDED = 2

SKIPPABLE = ['cshlib', 'cxxshlib', 'cstlib', 'cxxstlib', 'cprogram', 'cxxprogram']

TSTAMP_DB = '.wafpickle_tstamp_db_file'

SAVED_ATTRS = 'root node_sigs task_sigs imp_sigs raw_deps node_deps'.split()

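# The third layer from the docstring: a proxy that wraps the real build
# context. Each task generator gets one proxy with its own Node class and
# its own copies of the SAVED_ATTRS data, so that its build tree can be
# pickled and restored independently of the others.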
class bld_proxy(object):
	def __init__(self, bld):
		object.__setattr__(self, 'bld', bld)

		object.__setattr__(self, 'node_class', type('Nod3', (waflib.Node.Node,), {}))
		self.node_class.__module__ = 'waflib.Node'
		self.node_class.ctx = self

		object.__setattr__(self, 'root', self.node_class('', None))
		for x in SAVED_ATTRS:
			if x != 'root':
				object.__setattr__(self, x, {})

		self.fix_nodes()

	def __setattr__(self, name, value):
		bld = object.__getattribute__(self, 'bld')
		setattr(bld, name, value)

	def __delattr__(self, name):
		bld = object.__getattribute__(self, 'bld')
		delattr(bld, name)

	def __getattribute__(self, name):
		try:
			return object.__getattribute__(self, name)
		except AttributeError:
			bld = object.__getattribute__(self, 'bld')
			return getattr(bld, name)

	def __call__(self, *k, **kw):
		return self.bld(*k, **kw)

	def fix_nodes(self):
		for x in ('srcnode', 'path', 'bldnode'):
			node = self.root.find_dir(getattr(self.bld, x).abspath())
			object.__setattr__(self, x, node)

	def set_key(self, store_key):
		object.__setattr__(self, 'store_key', store_key)

	def fix_tg_path(self, *tgs):
		# changing Node objects on task generators is possible,
		# but all Node objects must belong to the same root
		for tg in tgs:
			tg.path = self.root.make_node(tg.path.abspath())

	def restore(self):
		dbfn = os.path.join(self.variant_dir, Context.DBFILE + self.store_key)
		Logs.debug('rev_use: reading %s', dbfn)
		try:
			data = Utils.readf(dbfn, 'rb')
		except (EnvironmentError, EOFError):
			# handle missing file/empty file
			Logs.debug('rev_use: Could not load the build cache %s (missing)', dbfn)
		else:
			try:
				waflib.Node.pickle_lock.acquire()
				waflib.Node.Nod3 = self.node_class
				try:
					data = Build.cPickle.loads(data)
				except Exception as e:
					Logs.debug('rev_use: Could not unpickle the build cache %s: %r', dbfn, e)
				else:
					for x in SAVED_ATTRS:
						object.__setattr__(self, x, data.get(x, {}))
			finally:
				waflib.Node.pickle_lock.release()
		self.fix_nodes()

	def store(self):
		data = {}
		for x in Build.SAVED_ATTRS:
			data[x] = getattr(self, x)
		db = os.path.join(self.variant_dir, Context.DBFILE + self.store_key)

		with waflib.Node.pickle_lock:
			waflib.Node.Nod3 = self.node_class
			try:
				x = Build.cPickle.dumps(data, Build.PROTOCOL)
			except Build.cPickle.PicklingError:
				root = data['root']
				for node_deps in data['node_deps'].values():
					for idx, node in enumerate(node_deps):
						# there may be more cross-context Node objects to fix,
						# but this should be the main source
						node_deps[idx] = root.find_node(node.abspath())
				x = Build.cPickle.dumps(data, Build.PROTOCOL)

		Logs.debug('rev_use: storing %s', db)
		Utils.writef(db + '.tmp', x, m='wb')
		try:
			st = os.stat(db)
			os.remove(db)
			if not Utils.is_win32:
				os.chown(db + '.tmp', st.st_uid, st.st_gid)
		except (AttributeError, OSError):
			pass
		os.rename(db + '.tmp', db)

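# A build context that binds one bld_proxy per task generator, maintains
# the timestamp/dependency database (TSTAMP_DB), and prunes the set of
# task generators to post down to the stale/needed ones on partial builds.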
class bld(Build.BuildContext):
	def __init__(self, **kw):
		super(bld, self).__init__(**kw)
		self.hashes_md5_tstamp = {}

	def __call__(self, *k, **kw):
		# this is one way of doing it, one could use a task generator method too
		bld = kw['bld'] = bld_proxy(self)
		ret = TaskGen.task_gen(*k, **kw)
		self.task_gen_cache_names = {}
		self.add_to_group(ret, group=kw.get('group'))
		ret.bld = bld
		bld.set_key(ret.path.abspath().replace(os.sep, '') + str(ret.idx))
		return ret

	def is_dirty(self):
		return True

	def store_tstamps(self):
		# Called after a build is finished
		# For each task generator, record all files involved in task objects
		# optimization: done only if there was something built
		do_store = False
		try:
			f_deps = self.f_deps
		except AttributeError:
			f_deps = self.f_deps = {}
			self.f_tstamps = {}

		allfiles = set()
		for g in self.groups:
			for tg in g:
				try:
					staleness = tg.staleness
				except AttributeError:
					staleness = DIRTY

				if staleness != DIRTY:
					# DONE case: there was nothing built
					# NEEDED case: the tg was brought in because of 'use' propagation,
					# but nothing really changed for it; there may be incomplete
					# tasks (object files), and in this case it is best to let the next build
					# figure out if an input/output file changed
					continue

				do_cache = False
				for tsk in tg.tasks:
					if tsk.hasrun == Task.SUCCESS:
						do_cache = True
					elif tsk.hasrun == Task.SKIPPED:
						pass
					else:
						# one failed task, clear the cache for this tg
						try:
							del f_deps[(tg.path.abspath(), tg.idx)]
						except KeyError:
							pass
						else:
							# just store the new state because there is a change
							do_store = True

						# skip the rest because there is no valid cache possible
						break
				else:
					if not do_cache:
						# all skipped, but is there anything in cache?
						try:
							f_deps[(tg.path.abspath(), tg.idx)]
						except KeyError:
							# probably cleared because a wscript file changed
							# store it
							do_cache = True

					if do_cache:

						# there was a rebuild, store the data structure too
						tg.bld.store()

						# all tasks skipped but no cache
						# or a successful task build
						do_store = True
						st = set()
						for tsk in tg.tasks:
							st.update(tsk.inputs)
							st.update(self.node_deps.get(tsk.uid(), []))

						# TODO do last/when loading the tgs?
						lst = []
						for k in ('wscript', 'wscript_build'):
							n = tg.path.find_node(k)
							if n:
								n.get_bld_sig()
								lst.append(n.abspath())

						lst.extend(sorted(x.abspath() for x in st))
						allfiles.update(lst)
						f_deps[(tg.path.abspath(), tg.idx)] = lst

		for x in allfiles:
			# f_tstamps has everything, while md5_tstamp can be relatively empty on partial builds
			self.f_tstamps[x] = self.hashes_md5_tstamp[x][0]

		if do_store:
			dbfn = os.path.join(self.variant_dir, TSTAMP_DB)
			Logs.debug('rev_use: storing %s', dbfn)
			dbfn_tmp = dbfn + '.tmp'
			x = Build.cPickle.dumps([self.f_tstamps, f_deps], Build.PROTOCOL)
			Utils.writef(dbfn_tmp, x, m='wb')
			os.rename(dbfn_tmp, dbfn)
			Logs.debug('rev_use: stored %s', dbfn)

	def store(self):
		self.store_tstamps()
		if self.producer.dirty:
			Build.BuildContext.store(self)

	def compute_needed_tgs(self):
		# assume the 'use' keys are not modified during the build phase

		dbfn = os.path.join(self.variant_dir, TSTAMP_DB)
		Logs.debug('rev_use: Loading %s', dbfn)
		try:
			data = Utils.readf(dbfn, 'rb')
		except (EnvironmentError, EOFError):
			Logs.debug('rev_use: Could not load the build cache %s (missing)', dbfn)
			self.f_deps = {}
			self.f_tstamps = {}
		else:
			try:
				self.f_tstamps, self.f_deps = Build.cPickle.loads(data)
			except Exception as e:
				Logs.debug('rev_use: Could not unpickle the build cache %s: %r', dbfn, e)
				self.f_deps = {}
				self.f_tstamps = {}
			else:
				Logs.debug('rev_use: Loaded %s', dbfn)

		# 1. obtain task generators that contain rebuilds
		# 2. obtain the 'use' graph and its dual
		stales = set()
		reverse_use_map = Utils.defaultdict(list)
		use_map = Utils.defaultdict(list)

		for g in self.groups:
			for tg in g:
				if tg.is_stale():
					stales.add(tg)

				try:
					lst = tg.use = Utils.to_list(tg.use)
				except AttributeError:
					pass
				else:
					for x in lst:
						try:
							xtg = self.get_tgen_by_name(x)
						except Errors.WafError:
							pass
						else:
							use_map[tg].append(xtg)
							reverse_use_map[xtg].append(tg)

		Logs.debug('rev_use: found %r stale tgs', len(stales))

		# 3. dfs to mark downstream tgs as stale
		visited = set()
		def mark_down(tg):
			if tg in visited:
				return
			visited.add(tg)
			Logs.debug('rev_use: marking down %r as stale', tg.name)
			tg.staleness = DIRTY
			for x in reverse_use_map[tg]:
				mark_down(x)
		for tg in stales:
			mark_down(tg)

		# 4. dfs to find ancestor tgs to mark as needed
		self.needed_tgs = needed_tgs = set()
		def mark_needed(tg):
			if tg in needed_tgs:
				return
			needed_tgs.add(tg)
			if tg.staleness == DONE:
				Logs.debug('rev_use: marking up %r as needed', tg.name)
				tg.staleness = NEEDED
			for x in use_map[tg]:
				mark_needed(x)
		for xx in visited:
			mark_needed(xx)

		# so we have the whole tg trees to post in the set "needed"
		# load their build trees
		for tg in needed_tgs:
			tg.bld.restore()
			tg.bld.fix_tg_path(tg)

		# the stale ones should be fully built, while the needed ones
		# may skip a few tasks, see create_compiled_task and apply_link_after below
		Logs.debug('rev_use: amount of needed task gens: %r', len(needed_tgs))

	def post_group(self):
		# assumption: we can ignore the folder/subfolder cuts
		def tgpost(tg):
			try:
				f = tg.post
			except AttributeError:
				pass
			else:
				f()

		if not self.targets or self.targets == '*':
			for tg in self.groups[self.current_group]:
				# this can cut quite a lot of tg objects
				if tg in self.needed_tgs:
					tgpost(tg)
		else:
			# default implementation
			return Build.BuildContext.post_group(self)

	def get_build_iterator(self):
		if not self.targets or self.targets == '*':
			self.compute_needed_tgs()
		return Build.BuildContext.get_build_iterator(self)

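# A task generator must be posted unless all of the checks below pass:
# the build and configuration caches exist and predate the last build,
# the generator has cached dependency data, and the timestamp of every
# recorded dependency file is unchanged.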
@taskgen_method
def is_stale(self):
	# assume no globs
	self.staleness = DIRTY

	# 1. the case of always stale targets
	if getattr(self, 'always_stale', False):
		return True

	# 2. check if the db file exists
	db = os.path.join(self.bld.variant_dir, Context.DBFILE)
	try:
		dbstat = os.stat(db).st_mtime
	except OSError:
		Logs.debug('rev_use: must post %r because this is a clean build', self.name)
		return True

	# 3.a check if the configuration exists
	cache_node = self.bld.bldnode.find_node('c4che/build.config.py')
	if not cache_node:
		return True

	# 3.b check if the configuration changed
	if os.stat(cache_node.abspath()).st_mtime > dbstat:
		Logs.debug('rev_use: must post %r because the configuration has changed', self.name)
		return True

	# 3.c any tstamp data?
	try:
		f_deps = self.bld.f_deps
	except AttributeError:
		Logs.debug('rev_use: must post %r because there is no f_deps', self.name)
		return True

	# 4. check if this is the first build (no cache)
	try:
		lst = f_deps[(self.path.abspath(), self.idx)]
	except KeyError:
		Logs.debug('rev_use: must post %r because it has no cached data', self.name)
		return True

	try:
		cache = self.bld.cache_tstamp_rev_use
	except AttributeError:
		cache = self.bld.cache_tstamp_rev_use = {}

	# 5. check that the timestamp of each dependency file listed is unchanged
	f_tstamps = self.bld.f_tstamps
	for x in lst:
		try:
			old_ts = f_tstamps[x]
		except KeyError:
			Logs.debug('rev_use: must post %r because %r is not in cache', self.name, x)
			return True

		try:
			try:
				ts = cache[x]
			except KeyError:
				ts = cache[x] = os.stat(x).st_mtime
		except OSError:
			del f_deps[(self.path.abspath(), self.idx)]
			Logs.debug('rev_use: must post %r because %r does not exist anymore', self.name, x)
			return True
		else:
			if ts != old_ts:
				Logs.debug('rev_use: must post %r because the timestamp on %r changed %r %r', self.name, x, old_ts, ts)
				return True

	self.staleness = DONE
	return False


@taskgen_method
def create_compiled_task(self, name, node):
	# skip the creation of object files
	# assumption: object-only targets are not skippable
	if self.staleness == NEEDED:
		# only libraries/programs can skip object files
		for x in SKIPPABLE:
			if x in self.features:
				return None

	out = '%s.%d.o' % (node.name, self.idx)
	task = self.create_task(name, node, node.parent.find_or_declare(out))
	try:
		self.compiled_tasks.append(task)
	except AttributeError:
		self.compiled_tasks = [task]
	return task

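# For a NEEDED generator no object files are created above, so its outputs
# are assumed up to date; mark all of its tasks as skipped so that the
# runner does not try to execute them.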
@feature(*SKIPPABLE)
@after_method('apply_link')
def apply_link_after(self):
	# cprogram/cxxprogram might be unnecessary
	if self.staleness != NEEDED:
		return
	for tsk in self.tasks:
		tsk.hasrun = Task.SKIPPED

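# Each bld_proxy has its own Node class, so nodes from different task
# generators live in different trees; Node.path_from is wrapped so that a
# node from another tree is first mapped into the current one through its
# absolute path before the relative path is computed.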
def path_from(self, node):
	# handle nodes of distinct types
	if node.ctx is not self.ctx:
		node = self.ctx.root.make_node(node.abspath())
	return self.default_path_from(node)
waflib.Node.Node.default_path_from = waflib.Node.Node.path_from
waflib.Node.Node.path_from = path_from

def h_file(self):
	# similar to md5_tstamp.py, but with a 2-layer cache
	# global_cache for the build context common to all task generators
	# local_cache for the build context proxy (one per task generator)
	#
	# the global cache is not persistent
	# the local cache is persistent and meant for partial builds
	#
	# assume all calls are made from a single thread
	#
	filename = self.abspath()
	st = os.stat(filename)

	global_cache = self.ctx.bld.hashes_md5_tstamp
	local_cache = self.ctx.hashes_md5_tstamp

	if filename in global_cache:
		# value already calculated in this build
		cval = global_cache[filename]

		# the value in the global cache is assumed to be calculated once
		# reverifying it could cause task generators
		# to get distinct tstamp values, thus missing rebuilds
		local_cache[filename] = cval
		return cval[1]

	if filename in local_cache:
		cval = local_cache[filename]
		if cval[0] == st.st_mtime:
			# correct value from a previous build
			# put it in the global cache
			global_cache[filename] = cval
			return cval[1]

	ret = Utils.h_file(filename)
	local_cache[filename] = global_cache[filename] = (st.st_mtime, ret)
	return ret
waflib.Node.Node.h_file = h_file

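# An illustrative session on a generated benchmark project (the genbench
# invocation and the touched path are hypothetical; see utils/genbench.py
# in the waf sources for the actual arguments):
#
#   python utils/genbench.py /tmp/bench
#   cd /tmp/bench && waf configure build   # full build, populates the caches
#   touch some/source.cpp && waf build     # fast partial rebuild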