MMFFsp3_loc Optimization - ProkopHapala/FireCore GitHub Wiki
- Tested on Computer in the offcie
GPU
run_ocl_opt(nSys=20|iPara=2) NOT-CONVERGED in50steps time 12.8106 [ms]( 256.212 [us/step]) bGridFF=1 iSysFMax=8 dovdW=1
Kenrlel for Bonded intersctions with maximal localization of variables
__attribute__((reqd_work_group_size(1,1,1)))
__kernel void getMMFFf4(
const int4 nDOFs, // 1 (nAtoms,nnode)
// Dynamical
__global float4* apos, // 2 [natoms]
__global float4* fapos, // 3 [natoms]
__global float4* fneigh, // 4 [nnode*4*2]
// parameters
__global int4* neighs, // 5 [nnode] neighboring atoms
__global int4* neighCell, // 5 [nnode] neighboring atoms
__global float4* REQKs, // 6 [natoms] non-boding parametes {R0,E0,Q}
__global float4* apars, // 7 [nnode] per atom forcefield parametrs {c0ss,Kss,c0sp}
__global float4* bLs, // 8 [nnode] bond lengths for each neighbor
__global float4* bKs, // 9 [nnode] bond stiffness for each neighbor
__global float4* Ksp, // 10 [nnode] stiffness of pi-alignment for each neighbor
__global float4* Kpp, // 11 [nnode] stiffness of pi-planarization for each neighbor
__global cl_Mat3* lvecs, // 12
__global cl_Mat3* ilvecs, // 13
__global float4* pbc_shifts,
const int npbc,
const int bSubtractVdW
){
const int iG = get_global_id (0); // intex of atom
const int iS = get_global_id (1); // index of system
//const int nG = get_global_size(0);
//const int nS = get_global_size(1); // number of systems
//const int iL = get_local_id (0);
//const int nL = get_local_size(0);
const int nAtoms=nDOFs.x;
const int nnode =nDOFs.y;
//const int nvec = nAtoms+nnode;
const int i0a = iS*nAtoms;
const int i0n = iS*nnode;
const int i0v = iS*(nAtoms+nnode);
const int iaa = iG + i0a;
const int ian = iG + i0n;
const int iav = iG + i0v;
#define NNEIGH 4
// ---- Dynamical
float4 hs [4]; // direction vectors of bonds
float3 fbs[4]; // force on neighbor sigma
float3 fps[4]; // force on neighbor pi
float3 fa = float3Zero; // force on center position
float E=0;
// ---- Params
const int4 ng = neighs[iaa];
const float3 pa = apos[iav].xyz;
const float4 par = apars[ian]; // (xy=s0_ss,z=ssK,w=piC0 )
// Temp Arrays
const int* ings = (int* )&ng;
const float ssC0 = par.x*par.x - par.y*par.y; // cos(2x) = cos(x)^2 - sin(x)^2, because we store cos(ang0/2) to use in evalAngleCosHalf
for(int i=0; i<NNEIGH; i++){ fbs[i]=float3Zero; fps[i]=float3Zero; }
float3 f1,f2; // temporary forces
{ // ========= BONDS
float3 fpi = float3Zero; // force on pi orbital
const int4 ngC = neighCell[iaa];
const float3 hpi = apos[iav+nAtoms].xyz;
const float4 vbL = bLs[ian];
const float4 vbK = bKs[ian];
const float4 vKs = Ksp[ian];
const float4 vKp = Kpp[ian];
const int* ingC = (int* )&ngC;
const float* bL = (float*)&vbL;
const float* bK = (float*)&vbK;
const float* Kspi = (float*)&vKs;
const float* Kppi = (float*)&vKp;
const int ipbc0 = iS*npbc;
for(int i=0; i<NNEIGH; i++){
float4 h;
const int ing = ings[i];
const int ingv = ing+i0v;
const int inga = ing+i0a;
if(ing<0) break;
h.xyz = apos[ingv].xyz - pa;
{ // PBC shifts
int ic = ingC[i];
h.xyz += pbc_shifts[ipbc0+ic].xyz;
}
float l = length(h.xyz);
h.w = 1./l;
h.xyz *= h.w;
hs[i] = h;
float epp = 0;
float esp = 0;
if(iG<ing){
E+= evalBond( h.xyz, l-bL[i], bK[i], &f1 ); fbs[i]-=f1; fa+=f1;
float kpp = Kppi[i];
if( (ing<nnode) && (kpp>1.e-6) ){ // Only node atoms have pi-pi alignemnt interaction
epp += evalPiAling( hpi, apos[ingv+nAtoms].xyz, kpp, &f1, &f2 ); fpi+=f1; fps[i]+=f2; // pi-alignment (konjugation)
E+=epp;
}
}
// pi-sigma
float ksp = Kspi[i];
if(ksp>1.e-6){
esp += evalAngCos( (float4){hpi,1.}, h, ksp, par.w, &f1, &f2 ); fpi+=f1; fa-=f2; fbs[i]+=f2; // pi-planarization (orthogonality)
E+=epp;
}
}
// --- Store Pi-forces
const int i4p=(iG + iS*nnode*2 )*4 + nnode*4;
for(int i=0; i<NNEIGH; i++){
fneigh[i4p+i] = (float4){fps[i],0};
}
fapos[iav+nAtoms] = (float4){fpi,0};
}
{ // ============== Angles
for(int i=0; i<NNEIGH; i++){
int ing = ings[i];
if(ing<0) break;
const float4 hi = hs[i];
const int ingv = ing+i0v;
const int inga = ing+i0a;
for(int j=i+1; j<NNEIGH; j++){
int jng = ings[j];
if(jng<0) break;
const int jngv = jng+i0v;
const int jnga = jng+i0a;
const float4 hj = hs[j];
E += evalAngleCosHalf( hi, hj, par.xy, par.z, &f1, &f2 );
fa -= f1+f2;
//if(bSubtractVdW)
{ // Remove vdW
float4 REQi=REQKs[inga]; // ToDo: can be optimized
float4 REQj=REQKs[jnga];
float4 REQij;
REQij.x = REQi.x + REQj.x;
REQij.yz = REQi.yz * REQj.yz;
float3 dp = (hj.xyz/hj.w) - (hi.xyz/hi.w);
float4 fij = getLJQH( dp, REQij, 1.0f );
f1 -= fij.xyz;
f2 += fij.xyz;
}
fbs[i]+= f1;
fbs[j]+= f2;
}
}
}
// ========= Save results
const int i4 =(iG + iS*nnode*2 )*4;
//const int i4p=i4+nnode*4;
for(int i=0; i<NNEIGH; i++){
fneigh[i4 +i] = (float4){fbs[i],0};
//fneigh[i4p+i] = (float4){fps[i],0};
}
//fapos[iav ] = (float4){fa ,0}; // If we do run it as first forcefield
fapos[iav ] += (float4){fa ,0}; // If we not run it as first forcefield
//fapos[iav+nAtoms] = (float4){fpi,0};
}
Kenrlel for Non-Bonded intersctions without using aliasses
__attribute__((reqd_work_group_size(32,1,1)))
__kernel void getNonBond(
const int4 ns, // 1
// Dynamical
__global float4* atoms, // 2
__global float4* forces, // 3
// Parameters
__global float4* REQKs, // 4
__global int4* neighs, // 5
__global int4* neighCell, // 6
__global cl_Mat3* lvecs, // 7
const int4 nPBC, // 8
const float4 GFFParams // 9
){
__local float4 LATOMS[32];
__local float4 LCLJS [32];
const bool bPBC = (nPBC.x+nPBC.y+nPBC.z)>0;
const int4 ng = neighs [ get_global_id(0) + get_global_id(1)*ns.x ];
const int4 ngC = neighCell[ get_global_id(0) + get_global_id(1)*ns.x ];
const float4 REQKi = REQKs [ get_global_id(0) + get_global_id(1)*ns.x ];
const float3 posi = atoms [ get_global_id(0) + get_global_id(1)*(ns.x+ns.y) ].xyz;
const float R2damp = GFFParams.x*GFFParams.x;
float4 fe = float4Zero;
const cl_Mat3 lvec = lvecs[ get_global_id(1) ];
const float3 shift0 = lvec.a.xyz*-nPBC.x + lvec.b.xyz*-nPBC.y + lvec.c.xyz*-nPBC.z;
const float3 shift_a = lvec.b.xyz + lvec.a.xyz*(nPBC.x*-2.f-1.f);
const float3 shift_b = lvec.c.xyz + lvec.b.xyz*(nPBC.y*-2.f-1.f);
// ========= Atom-to-Atom interaction ( N-body problem )
for (int j0=0; j0<get_global_size(0); j0+=get_local_size(0) ){
const int i=j0+get_local_id(0);
if(i<ns.x){
LATOMS[get_local_id(0)] = atoms [ i + get_global_id(1)*(ns.x+ns.y) ];
LCLJS [get_local_id(0)] = REQKs [ i + get_global_id(1)*ns.x ];
}
barrier(CLK_LOCAL_MEM_FENCE);
for (int jl=0; jl<get_local_size(0); jl++){
const int ja=j0+jl;
if( (ja!= get_global_id(0) ) && (ja<ns.x) ){ // ToDo: Should interact withhimself in PBC ?
const float4 aj = LATOMS[jl];
float4 REQK = LCLJS [jl];
float3 dp = aj.xyz - posi;
REQK.x +=REQKi.x;
REQK.yz *=REQKi.yz;
const bool bBonded = ((ja==ng.x)||(ja==ng.y)||(ja==ng.z)||(ja==ng.w));
if(bPBC){
int ipbc=0;
dp += shift0;
for(int iy=0; iy<3; iy++){
for(int ix=0; ix<3; ix++){
if( !( bBonded &&(
((ja==ng.x)&&(ipbc==ngC.x))
||((ja==ng.y)&&(ipbc==ngC.y))
||((ja==ng.z)&&(ipbc==ngC.z))
||((ja==ng.w)&&(ipbc==ngC.w))
))){
float4 fij = getLJQH( dp, REQK, R2damp );
fe += fij;
}
ipbc++;
dp += lvec.a.xyz;
}
dp += shift_a;
}
}else
if( !bBonded ){
fe += getLJQH( dp, REQK, R2damp );
}
}
}
}
if( get_global_id(0) < ns.x ){
forces[ get_global_id(0) + get_global_id(1)*(ns.x+ns.y) ] = fe; // If we do run it as first forcefield
}
CPU
- Tested with
MolWorld_sp3::run_omp()
with vdW switched offrun_omp( 500, opt.dt, 1e-6, 1000.0 );
ORIGINAL run_omp() NOT CONVERGED in 500/500 E=0.731637 |F|=0.000622025 time= 2.86888 [ms]( 5.73777 [us/500 iter])
SCOPED-VARS run_omp() NOT CONVERGED in 500/500 E=0.731634 |F|=0.000214906 time= 2.85973 [ms]( 5.71946 [us/500 iter])
FIX_NO_IFS run_omp() NOT CONVERGED in 500/500 E=0.731634 |F|=0.000244495 time= 2.81679 [ms]( 5.63359 [us/500 iter])
FIX_NO_IFS:
double eval_atom_opt(const int ia){
double E=0;
const Vec3d pa = apos [ia];
Vec3d fa = Vec3dZero;
const int* ings = neighs[ia].array;
Vec3d* fbs = fneigh +ia*4;
const Quat4d& apar = apars[ia];
const double piC0 = apar.w;
Quat4d hs[4];
Vec3d f1,f2;
// ========================= Bonds
{ // Bonds
const Vec3d hpi = pipos[ia];
Vec3d fpi = Vec3dZero;
const int* ingC = neighCell[ia].array;
const double* bK = bKs [ia].array;
const double* bL = bLs [ia].array;
const double* Kspi = Ksp [ia].array;
const double* Kppi = Kpp [ia].array;
Vec3d* fps = fneighpi +ia*4;
for(int i=0; i<4; i++){ fbs[i]=Vec3dZero; fps[i]=Vec3dZero; } // we initialize it here because of the break
for(int i=0; i<4; i++){
const int ing = ings[i];
if(ing<0) break;
const Vec3d pi = apos[ing];
Quat4d h;
h.f.set_sub( pi, pa );
const int ipbc = ingC[i];
h.f.add( shifts[ipbc] );
const double l = h.f.normalize();
h.e = 1/l;
hs [i] = h;
if(ia<ing){ // we should avoid double counting because otherwise node atoms would be computed 2x, but capping only once
E+= evalBond( h.f, l-bL[i], bK[i], f1 ); fbs[i].sub(f1); fa.add(f1);
const double kpp = Kppi[i];
if( (ing<nnode) && (kpp>1e-6) ){ // Only node atoms have pi-pi alignemnt interaction
E += evalPiAling( hpi, pipos[ing], 1., 1., kpp, f1, f2 ); fpi.add(f1); fps[i].add(f2);
}
}
const double ksp = Kspi[i];
if( (ksp>1e-6) ){
E += evalAngleCos( hpi, h.f , 1., h.e, ksp, piC0, f1, f2 ); fpi.add(f1); fa.sub(f2); fbs[i].add(f2);
}
}
fpipos[ia]=fpi;
} // Bonds
// ========================= Angles
const double R2damp=Rdamp*Rdamp;
if(doAngles){
const double ssK = apar.z;
const Vec2d cs0_ss = Vec2d{apar.x,apar.y};
const double ssC0 = cs0_ss.x*cs0_ss.x - cs0_ss.y*cs0_ss.y; // cos(2x) = cos(x)^2 - sin(x)^2, because we store cos(ang0/2) to use in evalAngleCosHalf
int iang=0;
for(int i=0; i<3; i++){
int ing = ings[i];
if(ing<0) break;
const Quat4d& hi = hs[i];
for(int j=i+1; j<4; j++){
int jng = ings[j];
if(jng<0) break;
const Quat4d& hj = hs[j];
E += evalAngleCosHalf( hi.f, hj.f, hi.e, hj.e, cs0_ss, ssK, f1, f2 );
fa .sub( f1+f2 );
fbs[i].add( f1 );
fbs[j].add( f2 );
}
}
} // doAngles
fapos [ia]=fa;
return E;
}