Conversation
distml/strategy/ps_strategy.py
Outdated
|
|
||
|
|
||
| class ParameterServerStrategy(BaseStrategy): | ||
| """Strategy that trains a model via collective AllReduce. |
There was a problem hiding this comment.
Change this docstring summary? It still says "AllReduce", but this is the parameter server strategy.
distml/strategy/ps_strategy.py
Outdated
| training_operator_cls, | ||
| operator_config=None, | ||
| initialization_hook=None, | ||
| num_workers=1, |
distml/strategy/ps_strategy.py
Outdated
|
|
||
| assert num_ps | ||
| self.num_ps = num_ps | ||
| self.num_workers = num_workers |
There was a problem hiding this comment.
Same here — don't use the plural form.
| assert num_ps | ||
| self.num_ps = num_ps | ||
| self.num_workers = num_workers | ||
| self.num_cpus_per_server = num_cpus_per_server |
distml/strategy/ps_strategy.py
Outdated
| ray.get([server.set_params.remote(this_shard_ref)]) | ||
|
|
||
| def _start_workers(self): | ||
| """Create worker(actor), maybe need worker group to manager these workers. |
| """ | ||
| # TODO (Hao): infer the per-replica batch size here... | ||
|
|
||
| # so here we get two set of params that will be passed around: |
There was a problem hiding this comment.
You can remove this comment, as it is redundant with the ones I left in AllReduceStrategy.
distml/strategy/ps_strategy.py
Outdated
| } | ||
|
|
||
| # Should we make two groups for worker and server? | ||
| self.worker_group = DataParallelGroup(**workergroup_init_args) |
There was a problem hiding this comment.
This is strange. Is this the same DataParallelGroup as the one in AllReduceStrategy?
If yes — then fine.
If not — is there any way we can share the same class? If that is hard, then we should at least use a different class name.
distml/strategy/ps_strategy.py
Outdated
| self.server_group.start_actors( | ||
| self.num_ps) # server at the last num_ps processes. | ||
|
|
||
| worker_rets = self.worker_group.test_connection() |
There was a problem hiding this comment.
Is testing the connection necessary? If not, we should probably move it to a DEBUG mode.
|
|
||
| def setup_operator(self): | ||
| # figure out the signature of training_operator_cls later. | ||
| self.training_operator = self.training_operator_cls( |
There was a problem hiding this comment.
I am not sure whether we should set up the whole operator on the server side. One drawback is that this will take a lot of GPU memory.
Uh oh!
There was an error while loading. Please reload this page.