%%writefile create_webpage.py

import glob
import re

import pandas as pd
from nltk import FreqDist
from nltk.tokenize import word_tokenize
def body_name_for(headline_path):
    """Return *headline_path* with a trailing ``.csv`` replaced by ``.txt``.

    The pattern is escaped and anchored so that only the real extension is
    rewritten; a ``csv`` fragment elsewhere in the name is left alone and
    cannot shift the underscore-separated fields used for matching.
    """
    return re.sub(r"\.csv$", ".txt", headline_path)


def match_key(path):
    """Return the 4th underscore-separated field of *path*, or ``None``.

    The whole path string (directory part included) is split on ``_``,
    so the field index only lines up while the directory names contain
    no underscores. ``None`` is returned when the path has fewer than
    four fields, instead of raising ``IndexError``.
    """
    fields = path.split("_")
    return fields[3] if len(fields) > 3 else None


headline_list = glob.glob("data/compiled data/processed headlines/*")
body_list = glob.glob("data/compiled data/processed article bodies/*.txt")

# Index the body files by their match key once, so each headline is a
# dictionary lookup rather than a rescan of the whole body list.
# Insertion order keeps the matches in body_list order.
bodies_by_key = {}
for body_path in body_list:
    body_key = match_key(body_path)
    if body_key is not None:
        bodies_by_key.setdefault(body_key, []).append(body_path)

# Load each headline file and stack it with every article body whose
# match key agrees with the headline's.
dfs = []  # one concatenated frame per (headline, body) match
for filename in headline_list:
    # Drop the "Unnamed: 0" column from the headline CSV.
    df0 = pd.read_csv(filename).drop("Unnamed: 0", axis=1)
    # Switch the extension so the headline name is comparable to body names.
    filename = body_name_for(filename)
    headline_key = match_key(filename)
    if headline_key is None:
        continue  # name does not follow the expected underscore layout
    for fn1 in bodies_by_key.get(headline_key, []):
        # fn1 is the full path as returned by glob, directory included.
        dfs.append(pd.concat([df0, pd.read_csv(fn1)]))

# TODO(review): confirm the row-wise concat above yields the intended
# headline/body frame, value-wise and text-wise.