MMFFsp3_loc Optimization - ProkopHapala/FireCore GitHub Wiki

Tested on Computer in the offcie

GPU

run_ocl_opt(nSys=20|iPara=2)	NOT-CONVERGED	in50steps	time	12.8106	[ms](	256.212	[us/step])	bGridFF=1	iSysFMax=8	dovdW=1

Kenrlel for Bonded intersctions with maximal localization of variables

__attribute__((reqd_work_group_size(1,1,1)))
__kernel void getMMFFf4(
    const int4 nDOFs,               // 1   (nAtoms,nnode)
    // Dynamical
    __global float4*  apos,         // 2  [natoms]
    __global float4*  fapos,        // 3  [natoms]     
    __global float4*  fneigh,       // 4  [nnode*4*2]
    // parameters
    __global int4*    neighs,       // 5  [nnode]  neighboring atoms
    __global int4*    neighCell,    // 5  [nnode]  neighboring atoms
    __global float4*  REQKs,        // 6  [natoms] non-boding parametes {R0,E0,Q} 
    __global float4*  apars,        // 7  [nnode]  per atom forcefield parametrs {c0ss,Kss,c0sp}
    __global float4*  bLs,          // 8  [nnode]  bond lengths  for each neighbor
    __global float4*  bKs,          // 9  [nnode]  bond stiffness for each neighbor
    __global float4*  Ksp,          // 10 [nnode]  stiffness of pi-alignment for each neighbor
    __global float4*  Kpp,          // 11 [nnode]  stiffness of pi-planarization for each neighbor
    __global cl_Mat3* lvecs,        // 12
    __global cl_Mat3* ilvecs,       // 13
    __global float4*  pbc_shifts,
    const int npbc,
    const int bSubtractVdW
){

    const int iG = get_global_id (0);   // intex of atom
    const int iS = get_global_id (1);   // index of system
    //const int nG = get_global_size(0);
    //const int nS = get_global_size(1);  // number of systems
    //const int iL = get_local_id  (0);
    //const int nL = get_local_size(0);
    const int nAtoms=nDOFs.x;
    const int nnode =nDOFs.y;
    //const int nvec  = nAtoms+nnode;

    const int i0a   = iS*nAtoms; 
    const int i0n   = iS*nnode; 
    const int i0v   = iS*(nAtoms+nnode);
    
    const int iaa = iG + i0a; 
    const int ian = iG + i0n; 
    const int iav = iG + i0v;
    

    #define NNEIGH 4
    
    // ---- Dynamical
    float4  hs [4];              // direction vectors of bonds
    float3  fbs[4];              // force on neighbor sigma
    float3  fps[4];              // force on neighbor pi
    float3  fa  = float3Zero;    // force on center position 

    float E=0;
    // ---- Params
    const int4   ng  = neighs[iaa];    
    const float3 pa  = apos[iav].xyz;
    const float4 par = apars[ian];    //     (xy=s0_ss,z=ssK,w=piC0 )
     

    // Temp Arrays
    const int*   ings  = (int*  )&ng; 


    const float   ssC0   = par.x*par.x - par.y*par.y;   // cos(2x) = cos(x)^2 - sin(x)^2, because we store cos(ang0/2) to use in  evalAngleCosHalf
    for(int i=0; i<NNEIGH; i++){ fbs[i]=float3Zero; fps[i]=float3Zero; }

    float3 f1,f2;         // temporary forces

    { // ========= BONDS

        float3  fpi = float3Zero;    // force on pi orbital
        const int4   ngC = neighCell[iaa];  
        const float3 hpi = apos[iav+nAtoms].xyz; 

        const float4 vbL = bLs[ian];       
        const float4 vbK = bKs[ian];       
        const float4 vKs = Ksp[ian];       
        const float4 vKp = Kpp[ian];  

        const int*   ingC  = (int*  )&ngC; 
        const float* bL    = (float*)&vbL; 
        const float* bK    = (float*)&vbK;
        const float* Kspi  = (float*)&vKs;  
        const float* Kppi  = (float*)&vKp; 

        const int ipbc0 = iS*npbc;

        for(int i=0; i<NNEIGH; i++){
            float4 h;
            const int ing  = ings[i];
            const int ingv = ing+i0v;
            const int inga = ing+i0a;
            if(ing<0) break;
            h.xyz    = apos[ingv].xyz - pa; 
            { // PBC shifts
                int ic  = ingC[i];
                h.xyz  += pbc_shifts[ipbc0+ic].xyz; 
            }
            float  l = length(h.xyz); 
            h.w      = 1./l;
            h.xyz   *= h.w;
            hs[i]    = h;

            float epp = 0;
            float esp = 0;

            if(iG<ing){  
                E+= evalBond( h.xyz, l-bL[i], bK[i], &f1 );  fbs[i]-=f1;  fa+=f1;   
                            
                float kpp = Kppi[i];
                if( (ing<nnode) && (kpp>1.e-6) ){   // Only node atoms have pi-pi alignemnt interaction
                    epp += evalPiAling( hpi, apos[ingv+nAtoms].xyz, kpp,  &f1, &f2 );   fpi+=f1;  fps[i]+=f2;    //   pi-alignment     (konjugation)
                    E+=epp;
                }
            } 
            
            // pi-sigma 
            float ksp = Kspi[i];
            if(ksp>1.e-6){  
                esp += evalAngCos( (float4){hpi,1.}, h, ksp, par.w, &f1, &f2 );   fpi+=f1; fa-=f2;  fbs[i]+=f2;    //   pi-planarization (orthogonality)
                E+=epp;
            }
        }

        // --- Store Pi-forces
        const int i4p=(iG + iS*nnode*2 )*4 + nnode*4;
        for(int i=0; i<NNEIGH; i++){
            fneigh[i4p+i] = (float4){fps[i],0};
        }
        fapos[iav+nAtoms]  = (float4){fpi,0}; 

    }
    
    { //  ============== Angles 

        for(int i=0; i<NNEIGH; i++){
            int ing = ings[i];
            if(ing<0) break;
            const float4 hi = hs[i];
            const int ingv = ing+i0v;
            const int inga = ing+i0a;
            for(int j=i+1; j<NNEIGH; j++){
                int jng  = ings[j];
                if(jng<0) break;
                const int jngv = jng+i0v;
                const int jnga = jng+i0a;
                const float4 hj = hs[j];            
                E += evalAngleCosHalf( hi, hj, par.xy, par.z, &f1, &f2 );            
                fa    -= f1+f2;

                //if(bSubtractVdW)
                { // Remove vdW
                    float4 REQi=REQKs[inga];   // ToDo: can be optimized
                    float4 REQj=REQKs[jnga];
                    float4 REQij;
                    REQij.x  = REQi.x  + REQj.x;
                    REQij.yz = REQi.yz * REQj.yz; 
                    float3 dp = (hj.xyz/hj.w) - (hi.xyz/hi.w); 
                    float4 fij = getLJQH( dp, REQij, 1.0f );
                    f1 -=  fij.xyz;
                    f2 +=  fij.xyz;
                }

                fbs[i]+= f1;
                fbs[j]+= f2;
            }
        }

    }
    
    // ========= Save results
    const int i4 =(iG + iS*nnode*2 )*4;
    //const int i4p=i4+nnode*4;
    for(int i=0; i<NNEIGH; i++){
        fneigh[i4 +i] = (float4){fbs[i],0};
        //fneigh[i4p+i] = (float4){fps[i],0};
    }
    //fapos[iav     ] = (float4){fa ,0}; // If we do  run it as first forcefield
    fapos[iav       ] += (float4){fa ,0};  // If we not run it as first forcefield
    //fapos[iav+nAtoms]  = (float4){fpi,0}; 
    
}

Kenrlel for Non-Bonded intersctions without using aliasses

__attribute__((reqd_work_group_size(32,1,1)))
__kernel void getNonBond(
    const int4 ns,                  // 1
    // Dynamical
    __global float4*  atoms,        // 2
    __global float4*  forces,       // 3
    // Parameters
    __global float4*  REQKs,        // 4
    __global int4*    neighs,       // 5
    __global int4*    neighCell,    // 6
    __global cl_Mat3* lvecs,        // 7
    const int4        nPBC,         // 8
    const float4      GFFParams     // 9
){
    __local float4 LATOMS[32];
    __local float4 LCLJS [32];

    const bool   bPBC = (nPBC.x+nPBC.y+nPBC.z)>0;

    const int4   ng    = neighs   [ get_global_id(0) + get_global_id(1)*ns.x ];
    const int4   ngC   = neighCell[ get_global_id(0) + get_global_id(1)*ns.x ];
    const float4 REQKi = REQKs    [ get_global_id(0) + get_global_id(1)*ns.x ];
    const float3 posi  = atoms    [ get_global_id(0) + get_global_id(1)*(ns.x+ns.y) ].xyz;
    const float  R2damp = GFFParams.x*GFFParams.x;
    float4 fe          = float4Zero;

    const cl_Mat3 lvec = lvecs[ get_global_id(1) ];

    const float3 shift0  = lvec.a.xyz*-nPBC.x + lvec.b.xyz*-nPBC.y + lvec.c.xyz*-nPBC.z;
    const float3 shift_a = lvec.b.xyz + lvec.a.xyz*(nPBC.x*-2.f-1.f);
    const float3 shift_b = lvec.c.xyz + lvec.b.xyz*(nPBC.y*-2.f-1.f);

    // ========= Atom-to-Atom interaction ( N-body problem )  
    for (int j0=0; j0<get_global_size(0); j0+=get_local_size(0) ){
        const int i=j0+get_local_id(0);
        if(i<ns.x){
            LATOMS[get_local_id(0)] = atoms [ i + get_global_id(1)*(ns.x+ns.y) ];
            LCLJS [get_local_id(0)] = REQKs [ i + get_global_id(1)*ns.x ];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        for (int jl=0; jl<get_local_size(0); jl++){
            const int ja=j0+jl;
            if( (ja!= get_global_id(0) ) && (ja<ns.x) ){   // ToDo: Should interact withhimself in PBC ?
                const float4 aj = LATOMS[jl];
                float4 REQK     = LCLJS [jl];
                float3 dp       = aj.xyz - posi;
                REQK.x  +=REQKi.x;
                REQK.yz *=REQKi.yz;
                const bool bBonded = ((ja==ng.x)||(ja==ng.y)||(ja==ng.z)||(ja==ng.w));

                if(bPBC){    
                    int ipbc=0;
                    dp += shift0;
                    for(int iy=0; iy<3; iy++){
                        for(int ix=0; ix<3; ix++){
                            if( !( bBonded &&(
                                    ((ja==ng.x)&&(ipbc==ngC.x))
                                    ||((ja==ng.y)&&(ipbc==ngC.y))
                                    ||((ja==ng.z)&&(ipbc==ngC.z))
                                    ||((ja==ng.w)&&(ipbc==ngC.w))
                            ))){
                                float4 fij = getLJQH( dp, REQK, R2damp );
                                fe += fij;
                            }
                            ipbc++; 
                            dp    += lvec.a.xyz; 
                        }
                        dp    += shift_a;
                    }
                }else 
                if( !bBonded ){
                    fe += getLJQH( dp, REQK, R2damp );
                }
            }
        }
    }
    if( get_global_id(0) < ns.x ){
        forces[ get_global_id(0) + get_global_id(1)*(ns.x+ns.y) ] = fe;           // If we do  run it as first forcefield 
    }

CPU

Tested with MolWorld_sp3::run_omp() with vdW switched off run_omp( 500, opt.dt, 1e-6, 1000.0 );

ORIGINAL    run_omp()	NOT	CONVERGED	in	500/500	E=0.731637	|F|=0.000622025	time=	2.86888	[ms](	5.73777	[us/500	iter])
SCOPED-VARS run_omp()	NOT	CONVERGED	in	500/500	E=0.731634	|F|=0.000214906	time=	2.85973	[ms](	5.71946	[us/500	iter])
FIX_NO_IFS  run_omp()	NOT	CONVERGED	in	500/500	E=0.731634	|F|=0.000244495	time=	2.81679	[ms](	5.63359	[us/500	iter])

FIX_NO_IFS:

    double eval_atom_opt(const int ia){
        double E=0;
        const Vec3d pa   = apos [ia]; 
        Vec3d       fa   = Vec3dZero;
        const int*  ings = neighs[ia].array;
        Vec3d* fbs  = fneigh   +ia*4;
        const Quat4d& apar  = apars[ia];
        const double  piC0  = apar.w;
        Quat4d  hs[4];
        Vec3d   f1,f2;

        // ========================= Bonds
        { // Bonds
            const Vec3d hpi = pipos[ia]; 
            Vec3d fpi       = Vec3dZero; 
            const int* ingC = neighCell[ia].array;
            const double* bK   = bKs      [ia].array;
            const double* bL   = bLs      [ia].array;
            const double* Kspi = Ksp      [ia].array;
            const double* Kppi = Kpp      [ia].array;
            Vec3d* fps  = fneighpi +ia*4;
            for(int i=0; i<4; i++){ fbs[i]=Vec3dZero; fps[i]=Vec3dZero; } // we initialize it here because of the break
            for(int i=0; i<4; i++){
                const int ing = ings[i];
                if(ing<0) break;
                const Vec3d  pi = apos[ing];
                Quat4d h; 
                h.f.set_sub( pi, pa );
                const int ipbc = ingC[i]; 
                h.f.add( shifts[ipbc] );
                const double l = h.f.normalize();
                h.e    = 1/l;
                hs [i] = h;
                if(ia<ing){   // we should avoid double counting because otherwise node atoms would be computed 2x, but capping only once
                    E+= evalBond( h.f, l-bL[i], bK[i], f1 ); fbs[i].sub(f1);  fa.add(f1);
                    const double kpp = Kppi[i];
                    if( (ing<nnode) && (kpp>1e-6) ){   // Only node atoms have pi-pi alignemnt interaction
                        E += evalPiAling( hpi, pipos[ing], 1., 1.,   kpp,       f1, f2 );   fpi.add(f1);  fps[i].add(f2);    
                    }
                } 
                const double ksp = Kspi[i];
                if( (ksp>1e-6) ){  
                    E += evalAngleCos( hpi, h.f      , 1., h.e, ksp, piC0, f1, f2 );   fpi.add(f1); fa.sub(f2);  fbs[i].add(f2);   
                }            
            }
            fpipos[ia]=fpi;
        } // Bonds


        // ========================= Angles
        const double R2damp=Rdamp*Rdamp;
        if(doAngles){
            const double ssK    = apar.z;
            const Vec2d  cs0_ss = Vec2d{apar.x,apar.y};
            const double ssC0   = cs0_ss.x*cs0_ss.x - cs0_ss.y*cs0_ss.y;   // cos(2x) = cos(x)^2 - sin(x)^2, because we store cos(ang0/2) to use in  evalAngleCosHalf
            int iang=0;
            for(int i=0; i<3; i++){
                int ing = ings[i];
                if(ing<0) break;
                const Quat4d& hi = hs[i];
                for(int j=i+1; j<4; j++){
                    int jng  = ings[j];
                    if(jng<0) break;
                    const Quat4d& hj = hs[j];    
                    E += evalAngleCosHalf( hi.f, hj.f,  hi.e, hj.e,  cs0_ss,  ssK, f1, f2 );
                    fa    .sub( f1+f2  );
                    fbs[i].add( f1     );
                    fbs[j].add( f2     );
                }
            }
        }  // doAngles
        fapos [ia]=fa; 
        return E;   
    }