[gmx-users] gromacs, lam and condor

Hsin-Lin Chiang jiangsl at phys.sinica.edu.tw
Sat Apr 3 03:08:29 CEST 2010


Hi,

Does anyone here use GROMACS, LAM, and Condor together?
I run GROMACS with LAM/MPI on a Condor pool.
Every time I submit a parallel job, it is matched to nodes that are already occupied, and the CPU utilization of each process stays below 10%.
How should I change the scripts?
Below are the submit script and the two executable scripts.

condor_mpi:
----
#!/bin/bash
Universe = parallel
Executable = ./lamscript
machine_count = 2
output = md_$(NODE).out
error = md_$(NODE).err
log = md.log
arguments = /stathome/jiangsl/simulation/gromacs/2OMP/2OMP_1_1/md.sh
+WantIOProxy = True
should_transfer_files = yes
when_to_transfer_output = on_exit
Queue
-------
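
I also wonder whether I should steer the matchmaker away from machines that are already busy. The following is only a guess on my part, using the standard LoadAvg and Mips machine ClassAd attributes; I have not confirmed it fixes the problem:
----
# Prefer lightly loaded machines and rank faster ones higher.
Requirements = (LoadAvg < 0.3)
Rank = Mips
----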

lamscript:
------- 
#!/bin/sh

_CONDOR_PROCNO=$_CONDOR_PROCNO
_CONDOR_NPROCS=$_CONDOR_NPROCS
_CONDOR_REMOTE_SPOOL_DIR=$_CONDOR_REMOTE_SPOOL_DIR

SSHD_SH=`condor_config_val libexec`
SSHD_SH=$SSHD_SH/sshd.sh

CONDOR_SSH=`condor_config_val libexec`
CONDOR_SSH=$CONDOR_SSH/condor_ssh

# Set this to the bin directory of your lam installation
# This also must be in your .cshrc file, so the remote side
# can find it!
export LAMDIR=/stathome/jiangsl/soft/lam-7.1.4
export PATH=${LAMDIR}/bin:${PATH}
export LD_LIBRARY_PATH=/lib:/usr/lib:$LAMDIR/lib:.:/opt/intel/compilers/lib
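
# For reference, the matching lines in my ~/.cshrc (csh syntax, since
# the comment above says the remote side reads .cshrc) would be:
#   setenv LAMDIR /stathome/jiangsl/soft/lam-7.1.4
#   setenv PATH ${LAMDIR}/bin:${PATH}
#   setenv LD_LIBRARY_PATH /lib:/usr/lib:${LAMDIR}/lib:.:/opt/intel/compilers/lib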

. $SSHD_SH $_CONDOR_PROCNO $_CONDOR_NPROCS

# If not the head node, just sleep forever, to let the
# sshds run
if [ $_CONDOR_PROCNO -ne 0 ]
then
    wait
    sshd_cleanup
    exit 0
fi

EXECUTABLE=$1
shift

# The binary is copied but the executable flag is cleared,
# so the script has to take care of this.
chmod +x $EXECUTABLE

# To allow multiple LAM jobs to run on a single machine,
# we have to give it a somewhat unique value.
export LAM_MPI_SESSION_SUFFIX=$$
export LAMRSH=$CONDOR_SSH
# When a job is killed by the user, this script will get SIGTERM.
# It has to catch it and clean up the LAM environment.
finalize()
{
    sshd_cleanup
    lamhalt
    exit
}
trap finalize TERM

CONDOR_CONTACT_FILE=$_CONDOR_SCRATCH_DIR/contact
export CONDOR_CONTACT_FILE
# The second field in the contact file is the machine name
# that condor_ssh knows how to use. Note that this used to
# say "sort -n +0 ...", but the old "+0" key syntax is now deprecated.
sort < $CONDOR_CONTACT_FILE | awk '{print $2}' > machines

# start the lam environment
# For older versions of LAM you may need to remove the "-ssi boot rsh" part of the line below.
lamboot -ssi boot rsh -ssi rsh_agent "$LAMRSH -x" machines

if [ $? -ne 0 ]
then
    echo "lamscript error booting lam"
    exit 1
fi
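
# Sanity check I added while debugging: lamnodes prints the nodes LAM
# actually booted, so I can compare the count against $_CONDOR_NPROCS.
lamnodes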

mpirun C -ssi rpi usysv -ssi coll_smp 1 $EXECUTABLE "$@" &
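
# Variant I am considering: request an explicit process count instead
# of "C" (one process per available CPU), in case the machines file
# lists more CPUs than Condor actually granted:
#   mpirun -np $_CONDOR_NPROCS -ssi rpi usysv -ssi coll_smp 1 $EXECUTABLE "$@" &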

CHILD=$!
TMP=130
while [ $TMP -gt 128 ] ; do
    wait $CHILD
    TMP=$?
done

# clean up files
sshd_cleanup
/bin/rm -f machines

# clean up lam
lamhalt

exit $TMP
----

md.sh:
----
#!/bin/sh
#running GROMACS
/stathome/jiangsl/soft/gromacs-4.0.5/bin/mdrun_mpi_d \
-s /stathome/jiangsl/simulation/gromacs/2OMP/2OMP_1_1/md/200ns.tpr \
-e /stathome/jiangsl/simulation/gromacs/2OMP/2OMP_1_1/md/200ns.edr \
-o /stathome/jiangsl/simulation/gromacs/2OMP/2OMP_1_1/md/200ns.trr \
-g /stathome/jiangsl/simulation/gromacs/2OMP/2OMP_1_1/md/200ns.log \
-c /stathome/jiangsl/simulation/gromacs/2OMP/2OMP_1_1/md/200ns.gro
-----
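
For what it's worth, this is how I check the per-process CPU usage on an execute node (plain procps ps, nothing Condor-specific; mdrun_mpi_d is simply the process name from md.sh above):
----
#!/bin/sh
# Show PID, %CPU, and command name for every running mdrun_mpi_d process.
ps -C mdrun_mpi_d -o pid,pcpu,comm
----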

Hsin-Lin