Extracts minimum repeating string from string variable - lvphj/epydemiology GitHub Wiki

Example 1

df = pd.DataFrame({'OriginalStr':['Value:abcdabcd','defdefxyz','hij hij ']})

print(df)

      OriginalStr
0  Value:abcdabcd
1       defdefxyz
2        hij hij 
df = epy.phjAddColumnOfMinRepeatingString(phjDF = df,
                                          phjColName = 'OriginalStr',
                                          phjNewColName = 'RepeatingStr',
                                          phjPrefixStr = 'Value:',
                                          phjSuffixStr = 'xyz',
                                          phjStripWhiteSpc = True,
                                          phjPrintResults = True)
Returned dataframe
==================
      OriginalStr RepeatingStr
0  Value:abcdabcd         abcd
1       defdefxyz          def
2        hij hij           hij

Example 2

myExampleDF = pd.DataFrame({'somevariable':['Producer: 12/345/6789 Other Mr A N',
                                            'Mixed Batch Mixed Batch ',
                                            ' Another mixed batch Another mixed batch',
                                            'FBO 98/765/4321 Happy Fmrs (Smith) AB1234',
                                            'FBO  KARRO - MALTON UNREAD',
                                            'Market: 56/789/1234 Livestock Sellers 56/789/1234 Livestock Sellers',
                                            'Market: Repeated string Repeated string Repeated string Repeated stringendbit']})

print(myExampleDF)
                                        somevariable
0                 Producer: 12/345/6789 Other Mr A N
1                           Mixed Batch Mixed Batch 
2            Another mixed batch Another mixed batch
3          FBO 98/765/4321 Happy Fmrs (Smith) AB1234
4                         FBO  KARRO - MALTON UNREAD
5  Market: 56/789/1234 Livestock Sellers 56/789/1...
6  Market: Repeated string Repeated string Repeat...
phjExampleDF = epy.phjAddColumnOfMinRepeatingString(phjDF = myExampleDF,
                                                    phjColName = 'somevariable',
                                                    phjNewColName = 'somevariable_norpt',
                                                    phjPrefixStr = 'Market:',
                                                    phjSuffixStr = 'endbit',
                                                    phjReattachAffixes = True,
                                                    phjReduceMultiSpc = True,
                                                    phjStripWhiteSpc = True,
                                                    phjPrintResults = False)

print(phjExampleDF)

                                        somevariable  \
0                 Producer: 12/345/6789 Other Mr A N   
1                           Mixed Batch Mixed Batch    
2            Another mixed batch Another mixed batch   
3          FBO 98/765/4321 Happy Fmrs (Smith) AB1234   
4                         FBO  KARRO - MALTON UNREAD   
5  Market: 56/789/1234 Livestock Sellers 56/789/1...   
6  Market: Repeated string Repeated string Repeat...   

                          somevariable_norpt  
0         Producer: 12/345/6789 Other Mr A N  
1                                Mixed Batch  
2                        Another mixed batch  
3  FBO 98/765/4321 Happy Fmrs (Smith) AB1234  
4                 FBO  KARRO - MALTON UNREAD  
5      Market: 56/789/1234 Livestock Sellers  
6              Market: Repeated stringendbit